Skip to content

Commit 8382ce5

Browse files
committed
AMDGPU: Inline constant when materializing FI with add on gfx9
This was relying on the SGPR that is usable for the carry-out clobber also being used for the input. There is no carry out on gfx9, so with no carry-out clobber to worry about, the literal can be used directly with a VOP2 add. llvm-svn: 371791
1 parent 4a8916c commit 8382ce5

File tree

4 files changed

+53
-7
lines changed

4 files changed

+53
-7
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -6110,7 +6110,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
61106110
Register DestReg,
61116111
RegScavenger &RS) const {
61126112
if (ST.hasAddNoCarry())
6113-
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
6113+
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
61146114

61156115
Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
61166116
// TODO: Users need to deal with this.

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -1285,12 +1285,15 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
12851285
.addImm(ST.getWavefrontSizeLog2())
12861286
.addReg(DiffReg, RegState::Kill);
12871287

1288+
const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
1289+
12881290
// TODO: Fold if use instruction is another add of a constant.
1289-
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1291+
if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
12901292
// FIXME: This can fail
12911293
MIB.addImm(Offset);
12921294
MIB.addReg(ScaledReg, RegState::Kill);
1293-
MIB.addImm(0); // clamp bit
1295+
if (!IsVOP2)
1296+
MIB.addImm(0); // clamp bit
12941297
} else {
12951298
Register ConstOffsetReg =
12961299
RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);

llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -176,13 +176,13 @@ ret:
176176
; Added offset can't be used with VOP3 add
177177
; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
178178
; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
179-
; GCN-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
179+
; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
180180

181181
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
182182
; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]]
183183

184184
; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
185-
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[K]], [[SCALED]]
185+
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]
186186

187187
; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]]
188188
; GCN: ds_write_b32 v0, [[VZ]]
@@ -200,13 +200,13 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
200200

201201
; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
202202
; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s33
203-
; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
203+
; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
204204

205205
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
206206
; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
207207

208208
; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]]
209-
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[OFFSET]], [[SCALED]]
209+
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]
210210

211211
; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]]
212212
; GCN: ds_write_b32 v0, [[VZ]]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
3+
4+
# Test what happens when an SGPR is unavailable for the unused add. The non-inline constant needs to be folded into the add instruction and not materialized in a register.
5+
6+
---
7+
name: scavenge_sgpr_pei_no_sgprs
8+
tracksRegLiveness: true
9+
10+
stack:
11+
- { id: 0, type: default, offset: 0, size: 4, alignment: 8192 }
12+
- { id: 1, type: default, offset: 0, size: 4, alignment: 8192 }
13+
14+
machineFunctionInfo:
15+
isEntryFunction: false
16+
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
17+
scratchWaveOffsetReg: $sgpr34
18+
frameOffsetReg: $sgpr33
19+
stackPtrOffsetReg: $sgpr32
20+
21+
body: |
22+
bb.0:
23+
liveins: $vgpr1
24+
25+
; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
26+
; CHECK: liveins: $vgpr1
27+
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
28+
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
29+
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
30+
; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
31+
; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
32+
; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
33+
; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr33, implicit $exec
34+
; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec
35+
; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
36+
; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
37+
; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
38+
; CHECK: $sgpr33 = frame-setup COPY $sgpr27
39+
; CHECK: S_ENDPGM 0, implicit $vcc
40+
S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
41+
$vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
42+
S_ENDPGM 0, implicit $vcc
43+
...

0 commit comments

Comments
 (0)