
Commit 5774388

Revert "[LSV] Refactoring + supporting bitcasts to a type of different size"
This reverts commit r337489.

It causes asserts to fire in some TensorFlow tests, e.g.
tensorflow/compiler/tests/gather_test.py on GPU. Example stack trace:

Start test case: GatherTest.testHigherRank
assertion failed at third_party/llvm/llvm/lib/Support/APInt.cpp:819 in llvm::APInt llvm::APInt::trunc(unsigned int) const: width && "Can't truncate to 0 bits"
    @     0x5559446ebe10  __assert_fail
    @     0x55593ef32f5e  llvm::APInt::trunc()
    @     0x55593d78f86e  (anonymous namespace)::Vectorizer::lookThroughComplexAddresses()
    @     0x55593d78f2bc  (anonymous namespace)::Vectorizer::areConsecutivePointers()
    @     0x55593d78d128  (anonymous namespace)::Vectorizer::isConsecutiveAccess()
    @     0x55593d78c926  (anonymous namespace)::Vectorizer::vectorizeInstructions()
    @     0x55593d78c221  (anonymous namespace)::Vectorizer::vectorizeChains()
    @     0x55593d78b948  (anonymous namespace)::Vectorizer::run()
    @     0x55593d78b725  (anonymous namespace)::LoadStoreVectorizer::runOnFunction()
    @     0x55593edf4b17  llvm::FPPassManager::runOnFunction()
    @     0x55593edf4e55  llvm::FPPassManager::runOnModule()
    @     0x55593edf563c  (anonymous namespace)::MPPassManager::runOnModule()
    @     0x55593edf5137  llvm::legacy::PassManagerImpl::run()
    @     0x55593edf5b71  llvm::legacy::PassManager::run()
    @     0x55593ced250d  xla::gpu::IrDumpingPassManager::run()
    @     0x55593ced5033  xla::gpu::(anonymous namespace)::EmitModuleToPTX()
    @     0x55593ced40ba  xla::gpu::(anonymous namespace)::CompileModuleToPtx()
    @     0x55593ced33d0  xla::gpu::CompileToPtx()
    @     0x55593b26b2a2  xla::gpu::NVPTXCompiler::RunBackend()
    @     0x55593b21f973  xla::Service::BuildExecutable()
    @     0x555938f44e64  xla::LocalService::CompileExecutable()
    @     0x555938f30a85  xla::LocalClient::Compile()
    @     0x555938de3c29  tensorflow::XlaCompilationCache::BuildExecutable()
    @     0x555938de4e9e  tensorflow::XlaCompilationCache::CompileImpl()
    @     0x555938de3da5  tensorflow::XlaCompilationCache::Compile()
    @     0x555938c5d962  tensorflow::XlaLocalLaunchBase::Compute()
    @     0x555938c68151  tensorflow::XlaDevice::Compute()
    @     0x55593f389e1f  tensorflow::(anonymous namespace)::ExecutorState::Process()
    @     0x55593f38a625  tensorflow::(anonymous namespace)::ExecutorState::ScheduleReady()::$_1::operator()()
*** SIGABRT received by PID 7798 (TID 7837) from PID 7798; ***

llvm-svn: 337541
1 parent 99a9f75 commit 5774388
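
The assertion in the trace is APInt::trunc()'s precondition that the requested width be non-zero. In the reverted lookThroughComplexAddresses(), the check Known.Zero.trunc(BitWidth - 1) requests a zero-bit truncation whenever the value feeding the zext/sext is only 1 bit wide, which is one plausible way to hit this assert. A minimal sketch of that failure mode (not taken from the failing test; it only needs LLVM's ADT headers and an assertions-enabled build of the LLVM revision above):

    #include "llvm/ADT/APInt.h"
    using namespace llvm;

    int main() {
      // Stand-in for the reverted check Known.Zero.trunc(BitWidth - 1):
      // with a hypothetical 1-bit value the requested width is 0.
      unsigned BitWidth = 1;
      APInt KnownZero(BitWidth, 0); // plays the role of Known.Zero
      // In an assertions build this fires
      // width && "Can't truncate to 0 bits" inside APInt::trunc().
      APInt Truncated = KnownZero.trunc(BitWidth - 1);
      (void)Truncated;
      return 0;
    }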

File tree

3 files changed, +50 -97 lines

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 46 additions & 62 deletions

@@ -118,6 +118,8 @@ class Vectorizer {
   bool run();
 
 private:
+  GetElementPtrInst *getSourceGEP(Value *Src) const;
+
   unsigned getPointerAddressSpace(Value *I);
 
   unsigned getAlignment(LoadInst *LI) const {
@@ -137,8 +139,6 @@ class Vectorizer {
   }
 
   bool isConsecutiveAccess(Value *A, Value *B);
-  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size);
-  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta);
 
   /// After vectorization, reorder the instructions that I depends on
   /// (the instructions defining its operands), to ensure they dominate I.
@@ -277,6 +277,21 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
   return -1;
 }
 
+GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
+  // First strip pointer bitcasts. Make sure pointee size is the same with
+  // and without casts.
+  // TODO: a stride set by the add instruction below can match the difference
+  // in pointee type size here. Currently it will not be vectorized.
+  Value *SrcPtr = getLoadStorePointerOperand(Src);
+  Value *SrcBase = SrcPtr->stripPointerCasts();
+  Type *SrcPtrType = SrcPtr->getType()->getPointerElementType();
+  Type *SrcBaseType = SrcBase->getType()->getPointerElementType();
+  if (SrcPtrType->isSized() && SrcBaseType->isSized() &&
+      DL.getTypeStoreSize(SrcPtrType) == DL.getTypeStoreSize(SrcBaseType))
+    SrcPtr = SrcBase;
+  return dyn_cast<GetElementPtrInst>(SrcPtr);
+}
+
 // FIXME: Merge with llvm::isConsecutiveAccess
 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
   Value *PtrA = getLoadStorePointerOperand(A);
@@ -289,6 +304,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
     return false;
 
   // Make sure that A and B are different pointers of the same size type.
+  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   Type *PtrATy = PtrA->getType()->getPointerElementType();
   Type *PtrBTy = PtrB->getType()->getPointerElementType();
   if (PtrA == PtrB ||
@@ -298,16 +314,10 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
           DL.getTypeStoreSize(PtrBTy->getScalarType()))
     return false;
 
-  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
 
-  return areConsecutivePointers(PtrA, PtrB, Size);
-}
-
-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
-  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
-  APInt OffsetA(PtrBitWidth, 0);
-  APInt OffsetB(PtrBitWidth, 0);
+  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
+  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
 
@@ -341,94 +351,68 @@ bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
   // Sometimes even this doesn't work, because SCEV can't always see through
   // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
   // things the hard way.
-  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta);
-}
-
-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
-                                             APInt PtrDelta) {
-  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
-  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
-  if (!GEPA || !GEPB)
-    return false;
 
   // Look through GEPs after checking they're the same except for the last
   // index.
-  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
-      GEPA->getPointerOperand() != GEPB->getPointerOperand())
+  GetElementPtrInst *GEPA = getSourceGEP(A);
+  GetElementPtrInst *GEPB = getSourceGEP(B);
+  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
     return false;
-  gep_type_iterator GTIA = gep_type_begin(GEPA);
-  gep_type_iterator GTIB = gep_type_begin(GEPB);
-  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
-    if (GTIA.getOperand() != GTIB.getOperand())
+  unsigned FinalIndex = GEPA->getNumOperands() - 1;
+  for (unsigned i = 0; i < FinalIndex; i++)
+    if (GEPA->getOperand(i) != GEPB->getOperand(i))
       return false;
-    ++GTIA;
-    ++GTIB;
-  }
 
-  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
-  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
+  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
+  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
       OpA->getType() != OpB->getType())
     return false;
 
-  if (PtrDelta.isNegative()) {
-    if (PtrDelta.isMinSignedValue())
-      return false;
-    PtrDelta.negate();
-    std::swap(OpA, OpB);
-  }
-  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
-  if (PtrDelta.urem(Stride) != 0)
-    return false;
-  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
-  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
-
   // Only look through a ZExt/SExt.
   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
     return false;
 
   bool Signed = isa<SExtInst>(OpA);
 
-  // At this point A could be a function parameter, i.e. not an instruction
-  Value *ValA = OpA->getOperand(0);
+  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
-  if (!OpB || ValA->getType() != OpB->getType())
+  if (!OpA || !OpB || OpA->getType() != OpB->getType())
    return false;
 
-  // Now we need to prove that adding IdxDiff to ValA won't overflow.
+  // Now we need to prove that adding 1 to OpA won't overflow.
   bool Safe = false;
-  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
-  // ValA, we're okay.
+  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
+  // we're okay.
   if (OpB->getOpcode() == Instruction::Add &&
       isa<ConstantInt>(OpB->getOperand(1)) &&
-      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
+      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
   }
 
-  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
+  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
 
   // Second attempt:
-  // If all set bits of IdxDiff or any higher order bit other than the sign bit
-  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
-  // overflow of any sort.
+  // If any bits are known to be zero other than the sign bit in OpA, we can
+  // add 1 to it while guaranteeing no overflow of any sort.
   if (!Safe) {
-    OpA = dyn_cast<Instruction>(ValA);
-    if (!OpA)
-      return false;
    KnownBits Known(BitWidth);
    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
-    if (Known.Zero.trunc(BitWidth - 1).zext(IdxBitWidth).ult(IdxDiff))
-      return false;
+    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
+      Safe = true;
   }
 
-  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+  if (!Safe)
+    return false;
+
+  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
   const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
-  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
-  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
-  return X == OffsetSCEVB;
+  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
+  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
+  return X2 == OffsetSCEVB;
 }
 
 void Vectorizer::reorder(Instruction *I) {
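
An aside on the restored "second attempt" overflow check above (Known.countMaxTrailingOnes() < BitWidth - 1): if some bit strictly below the sign bit is known to be zero, the carry produced by adding 1 must stop at or before that bit, so the addition can neither wrap nor flip the sign bit. A minimal standalone sketch of that argument using LLVM's KnownBits directly, with hand-set known bits instead of computeKnownBits() on real IR:

    #include "llvm/Support/KnownBits.h"
    using namespace llvm;

    // True if "V + 1" provably cannot wrap, given what is known about V.
    // Mirrors the restored check: some bit below the sign bit is known zero,
    // so at most BitWidth - 2 trailing ones are possible and the carry from
    // +1 dies out before reaching the top bit.
    static bool addOneCannotWrap(const KnownBits &Known) {
      return Known.countMaxTrailingOnes() < Known.getBitWidth() - 1;
    }

    int main() {
      KnownBits Known(32);
      Known.Zero.setBit(3); // pretend bit 3 of the value is known to be zero
      bool Safe = addOneCannotWrap(Known); // true: +1 cannot carry past bit 3
      (void)Safe;
      return 0;
    }
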
Lines changed: 2 additions & 16 deletions

@@ -1,9 +1,8 @@
-; RUN: opt -mtriple x86_64-- -load-store-vectorizer < %s -S | FileCheck %s
+; RUN: opt -load-store-vectorizer < %s -S | FileCheck %s
 
 %struct_render_pipeline_state = type opaque
 
-define fastcc void @test1(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
-; CHECK-LABEL: @test1
+define fastcc void @main(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
 ; CHECK: load i16
 ; CHECK: load i16
 entry:
@@ -15,16 +14,3 @@ entry:
   %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2
   ret void
 }
-
-define fastcc void @test2(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr {
-; CHECK-LABEL: @test2
-; CHECK: load <2 x i16>
-entry:
-  %tmp = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i16 addrspace(1)*
-  %tmp1 = load i16, i16 addrspace(1)* %tmp, align 2
-  %tmp2 = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i8 addrspace(1)*
-  %sunkaddr51 = getelementptr i8, i8 addrspace(1)* %tmp2, i64 2
-  %tmp3 = bitcast i8 addrspace(1)* %sunkaddr51 to i16 addrspace(1)*
-  %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2
-  ret void
-}

llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll

Lines changed: 2 additions & 19 deletions

@@ -56,8 +56,8 @@ define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba
   ret void
 }
 
+; TODO: This can be vectorized, but currently vectorizer unable to do it.
 ; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx
-; CHECK: load <4 x i32>
 define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
   %add1 = add nuw i32 %base, 0
   %zext1 = zext i32 %add1 to i64
@@ -74,27 +74,10 @@ define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba
   %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
   %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
   %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 12
+  %add4 = add nuw i32 %base, 16
   %zext4 = zext i32 %add4 to i64
   %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
   %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
   %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
   ret void
 }
-
-; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta
-; CHECK: load <2 x i32>
-define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) {
-  %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)*
-  %a.offset = add nuw i32 %base, 4
-  %t.offset.zexted = zext i32 %base to i64
-  %a.offset.zexted = zext i32 %a.offset to i64
-  %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted
-  %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted
-  %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6
-  %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)*
-  %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)*
-  %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted
-  %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted
-  ret void
-}
