Skip to content

Commit 2e794a4

Browse files
committed
[AArch64] Stack frame reordering.
Implement stack frame reordering in the AArch64 backend.

Unlike the X86 implementation, AArch64 does not seem to benefit from "access density" based frame reordering, mainly because it has a much smaller variety of addressing modes, and because all instructions are 4 bytes, so each frame object is either in range of an instruction (and then the access is "free") or not (and that has a code size cost of 4 bytes).

This change improves Memory Tagging codegen by:
* Placing an object that has been chosen as the base tagged pointer of the function at SP + 0. This saves one instruction to set up the pointer (IRG does not have an offset immediate), and more because that object can now be referenced without materializing its tagged address in a scratch register.
* Placing objects that go out of scope simultaneously together. This exposes opportunities for instruction merging in tryMergeAdjacentSTG.

Differential Revision: https://reviews.llvm.org/D72366
1 parent 2f63e57 commit 2e794a4

File tree

4 files changed

+239
-1
lines changed

4 files changed

+239
-1
lines changed

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

+163
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,10 @@ static cl::opt<bool> StackTaggingMergeSetTag(
176176
cl::desc("merge settag instruction in function epilog"), cl::init(true),
177177
cl::Hidden);
178178

179+
// Hidden command-line switch, on by default, that gates
// AArch64FrameLowering::orderFrameObjects. Passing
// -aarch64-order-frame-objects=0 makes that function return without touching
// the allocation order.
static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
                                       cl::init(true), cl::Hidden,
                                       cl::desc("sort stack allocations"));
182+
179183
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
180184

181185
/// Returns the argument pop size.
@@ -3297,3 +3301,162 @@ unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
32973301
return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
32983302
getStackAlign());
32993303
}
3304+
3305+
namespace {
3306+
/// Per-stack-object record used while deciding the allocation order.
/// A default-constructed record is "invalid" and belongs to no group.
struct FrameObject {
  /// Set for objects that take part in the reordering; records left invalid
  /// are sorted behind all valid ones.
  bool IsValid{false};
  /// Index of the object in MFI.
  int ObjectIndex{0};
  /// ID of the group this object belongs to, or -1 when it is ungrouped.
  int GroupIndex{-1};
  /// Marks the single object that should be placed first (closest to SP).
  bool ObjectFirst{false};
  /// Marks every member of the group that contains the ObjectFirst object;
  /// that whole group should be placed first.
  bool GroupFirst{false};
};
3318+
3319+
class GroupBuilder {
3320+
SmallVector<int, 8> CurrentMembers;
3321+
int NextGroupIndex = 0;
3322+
std::vector<FrameObject> &Objects;
3323+
3324+
public:
3325+
GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
3326+
void AddMember(int Index) { CurrentMembers.push_back(Index); }
3327+
void EndCurrentGroup() {
3328+
if (CurrentMembers.size() > 1) {
3329+
// Create a new group with the current member list. This might remove them
3330+
// from their pre-existing groups. That's OK, dealing with overlapping
3331+
// groups is too hard and unlikely to make a difference.
3332+
LLVM_DEBUG(dbgs() << "group:");
3333+
for (int Index : CurrentMembers) {
3334+
Objects[Index].GroupIndex = NextGroupIndex;
3335+
LLVM_DEBUG(dbgs() << " " << Index);
3336+
}
3337+
LLVM_DEBUG(dbgs() << "\n");
3338+
NextGroupIndex++;
3339+
}
3340+
CurrentMembers.clear();
3341+
}
3342+
};
3343+
3344+
bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
3345+
// Objects at a lower index are closer to FP; objects at a higher index are
3346+
// closer to SP.
3347+
//
3348+
// For consistency in our comparison, all invalid objects are placed
3349+
// at the end. This also allows us to stop walking when we hit the
3350+
// first invalid item after it's all sorted.
3351+
//
3352+
// The "first" object goes first (closest to SP), followed by the members of
3353+
// the "first" group.
3354+
//
3355+
// The rest are sorted by the group index to keep the groups together.
3356+
// Higher numbered groups are more likely to be around longer (i.e. untagged
3357+
// in the function epilogue and not at some earlier point). Place them closer
3358+
// to SP.
3359+
//
3360+
// If all else equal, sort by the object index to keep the objects in the
3361+
// original order.
3362+
return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
3363+
A.ObjectIndex) <
3364+
std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
3365+
B.ObjectIndex);
3366+
}
3367+
} // namespace
3368+
3369+
void AArch64FrameLowering::orderFrameObjects(
3370+
const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
3371+
if (!OrderFrameObjects || ObjectsToAllocate.empty())
3372+
return;
3373+
3374+
const MachineFrameInfo &MFI = MF.getFrameInfo();
3375+
std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
3376+
for (auto &Obj : ObjectsToAllocate) {
3377+
FrameObjects[Obj].IsValid = true;
3378+
FrameObjects[Obj].ObjectIndex = Obj;
3379+
}
3380+
3381+
// Identify stack slots that are tagged at the same time.
3382+
GroupBuilder GB(FrameObjects);
3383+
for (auto &MBB : MF) {
3384+
for (auto &MI : MBB) {
3385+
if (MI.isDebugInstr())
3386+
continue;
3387+
int OpIndex;
3388+
switch (MI.getOpcode()) {
3389+
case AArch64::STGloop:
3390+
case AArch64::STZGloop:
3391+
OpIndex = 3;
3392+
break;
3393+
case AArch64::STGOffset:
3394+
case AArch64::STZGOffset:
3395+
case AArch64::ST2GOffset:
3396+
case AArch64::STZ2GOffset:
3397+
OpIndex = 1;
3398+
break;
3399+
default:
3400+
OpIndex = -1;
3401+
}
3402+
3403+
int TaggedFI = -1;
3404+
if (OpIndex >= 0) {
3405+
const MachineOperand &MO = MI.getOperand(OpIndex);
3406+
if (MO.isFI()) {
3407+
int FI = MO.getIndex();
3408+
if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
3409+
FrameObjects[FI].IsValid)
3410+
TaggedFI = FI;
3411+
}
3412+
}
3413+
3414+
// If this is a stack tagging instruction for a slot that is not part of a
3415+
// group yet, either start a new group or add it to the current one.
3416+
if (TaggedFI >= 0)
3417+
GB.AddMember(TaggedFI);
3418+
else
3419+
GB.EndCurrentGroup();
3420+
}
3421+
// Groups should never span multiple basic blocks.
3422+
GB.EndCurrentGroup();
3423+
}
3424+
3425+
// If the function's tagged base pointer is pinned to a stack slot, we want to
3426+
// put that slot first when possible. This will likely place it at SP + 0,
3427+
// and save one instruction when generating the base pointer because IRG does
3428+
// not allow an immediate offset.
3429+
const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
3430+
Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
3431+
if (TBPI) {
3432+
FrameObjects[*TBPI].ObjectFirst = true;
3433+
FrameObjects[*TBPI].GroupFirst = true;
3434+
int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
3435+
if (FirstGroupIndex >= 0)
3436+
for (FrameObject &Object : FrameObjects)
3437+
if (Object.GroupIndex == FirstGroupIndex)
3438+
Object.GroupFirst = true;
3439+
}
3440+
3441+
llvm::stable_sort(FrameObjects, FrameObjectCompare);
3442+
3443+
int i = 0;
3444+
for (auto &Obj : FrameObjects) {
3445+
// All invalid items are sorted at the end, so it's safe to stop.
3446+
if (!Obj.IsValid)
3447+
break;
3448+
ObjectsToAllocate[i++] = Obj.ObjectIndex;
3449+
}
3450+
3451+
LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
3452+
: FrameObjects) {
3453+
if (!Obj.IsValid)
3454+
break;
3455+
dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
3456+
if (Obj.ObjectFirst)
3457+
dbgs() << ", first";
3458+
if (Obj.GroupFirst)
3459+
dbgs() << ", group-first";
3460+
dbgs() << "\n";
3461+
});
3462+
}

llvm/lib/Target/AArch64/AArch64FrameLowering.h

+4
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,10 @@ class AArch64FrameLowering : public TargetFrameLowering {
118118
return StackId != TargetStackID::SVEVector;
119119
}
120120

121+
/// Reorders the frame indices in \p ObjectsToAllocate in place. The AArch64
/// implementation (in AArch64FrameLowering.cpp) groups stack slots that are
/// tagged together and moves the tagged base pointer slot closest to SP; it
/// is a no-op when -aarch64-order-frame-objects is disabled or the list is
/// empty.
void
122+
orderFrameObjects(const MachineFunction &MF,
123+
SmallVectorImpl<int> &ObjectsToAllocate) const override;
124+
121125
private:
122126
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
123127
uint64_t StackBumpBytes) const;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
; RUN: llc < %s -mtriple=aarch64 -mattr=+mte -aarch64-order-frame-objects=1 | FileCheck %s
2+
3+
declare void @use(i8* %p)
4+
declare void @llvm.aarch64.settag(i8* %p, i64 %a)
5+
declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a)
6+
7+
; Two loops of size 256; the second loop updates SP.
8+
; After frame reordering, two loops can be merged into one.
9+
; %a, %a2, %c and %c2 (128 bytes each) are tagged back to back just before the
; return, while %b (32 bytes) is only passed to @use and is declared between
; the two pairs. The expected output is a single tagging loop over all
; 4 * 128 = 512 bytes that post-increments sp.
define void @stg128_128_gap_128_128() {
10+
entry:
11+
; CHECK-LABEL: stg128_128_gap_128_128:
12+
; CHECK: mov x8, #512
13+
; CHECK: st2g sp, [sp], #32
14+
; CHECK: sub x8, x8, #32
15+
; CHECK: cbnz x8,
16+
; CHECK: ret
17+
%a = alloca i8, i32 128, align 16
18+
%a2 = alloca i8, i32 128, align 16
19+
%b = alloca i8, i32 32, align 16
20+
%c = alloca i8, i32 128, align 16
21+
%c2 = alloca i8, i32 128, align 16
22+
call void @use(i8* %b)
23+
call void @llvm.aarch64.settag(i8* %a, i64 128)
24+
call void @llvm.aarch64.settag(i8* %a2, i64 128)
25+
call void @llvm.aarch64.settag(i8* %c, i64 128)
26+
call void @llvm.aarch64.settag(i8* %c2, i64 128)
27+
ret void
28+
}
29+
30+
; Same idea with control flow: %a/%a2 (160 bytes each) are tagged together in
; if.then, %c/%c2 (128 bytes each) together in if.else, and all four again in
; if.end. The expected output has one tagging loop per block, covering 320, 256
; and 576 bytes respectively; only the final loop in if.end addresses through
; sp, the other two go through x9.
define void @stg2(i1 %flag) {
30+
entry:
31+
; CHECK-LABEL: stg2:
32+
%a = alloca i8, i32 160, align 16
33+
%a2 = alloca i8, i32 160, align 16
34+
%b = alloca i8, i32 32, align 16
35+
%c = alloca i8, i32 128, align 16
36+
%c2 = alloca i8, i32 128, align 16
37+
call void @use(i8* %b)
38+
br i1 %flag, label %if.then, label %if.else
39+
40+
if.then:
41+
; CHECK: mov x8, #320
42+
; CHECK: st2g x9, [x9], #32
43+
; CHECK: sub x8, x8, #32
44+
; CHECK: cbnz x8,
45+
call void @llvm.aarch64.settag(i8* %a, i64 160)
46+
call void @llvm.aarch64.settag(i8* %a2, i64 160)
47+
br label %if.end
48+
49+
if.else:
50+
; CHECK: mov x8, #256
51+
; CHECK: st2g x9, [x9], #32
52+
; CHECK: sub x8, x8, #32
53+
; CHECK: cbnz x8,
54+
call void @llvm.aarch64.settag(i8* %c, i64 128)
55+
call void @llvm.aarch64.settag(i8* %c2, i64 128)
56+
br label %if.end
57+
58+
if.end:
59+
; CHECK: mov x8, #576
60+
; CHECK: st2g sp, [sp], #32
61+
; CHECK: sub x8, x8, #32
62+
; CHECK: cbnz x8,
63+
call void @llvm.aarch64.settag(i8* %a, i64 160)
64+
call void @llvm.aarch64.settag(i8* %a2, i64 160)
65+
call void @llvm.aarch64.settag(i8* %c, i64 128)
66+
call void @llvm.aarch64.settag(i8* %c2, i64 128)
67+
68+
; CHECK: ret
69+
ret void
70+
}

llvm/test/CodeGen/AArch64/settag-merge.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s
1+
; RUN: llc < %s -mtriple=aarch64 -mattr=+mte -aarch64-order-frame-objects=0 | FileCheck %s
22

33
declare void @use(i8* %p)
44
declare void @llvm.aarch64.settag(i8* %p, i64 %a)

0 commit comments

Comments
 (0)