[AMDGPU] Enable divergence predicates for ctlz/cttz

alex-t · alex-t · commit 19727e31fb2c · 2021-12-20T20:53:48.000+03:00
ctlz/cttz get lowered to a set of target opcodes. This change enables ISel to select the SALU or VALU form according to the SDNode divergence: CTLZ - S_FLBIT_I32_B32 if uniform and V_FFBH_U32_e64 if divergent; CTTZ - S_FF1_I32_B32 if uniform and V_FFBL_B32_e64 if divergent. Also, @llvm.amdgcn.sffbh.i32 gets lowered to S_FLBIT_I32 if uniform and V_FFBH_I32_e64 if divergent. NOTE: The 64-bit versions S_FF1_I32_B64 and S_FLBIT_I32_B64 are not currently supported by the DAG ISel; ctlz/cttz with i64 input are split into two 32-bit instructions. Nevertheless, they already have patterns and have been equipped with the divergence predicates to make sure they will be selected correctly when enabled. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D116044
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -257,22 +257,22 @@ let isReMaterializable = 1 in {
 def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
 def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
 def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
-  [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))]
 >;
 
 def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
-  [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i32:$src0))]
 >;
 
 def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
-  [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i32:$src0))]
 >;
 
 def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64",
-  [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))]
 >;
 def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
-  [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))]
 >;
 def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
 def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: name:            s_ctlz_i32
+; GCN: S_FLBIT_I32_B32
+define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+  store i32 %ctlz, i32 addrspace(1)* %out, align 4
+  ret void
+}
+; GCN-LABEL: name:            v_ctlz_i32
+; GCN: V_FFBH_U32_e64
+define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+  store i32 %ctlz, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name:            s_cttz_i32
+; GCN: S_FF1_I32_B32
+define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {	
+  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
+  store i32 %cttz, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name:            v_cttz_i32
+; GCN: V_FFBL_B32_e64
+define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
+  store i32 %cttz, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name:            s_flbit
+; GCN: S_FLBIT_I32
+define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+  %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
+  store i32 %r, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name:            v_flbit
+; GCN: V_FFBH_I32_e64
+define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
+  store i32 %r, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
+declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
+declare i32 @llvm.amdgcn.sffbh.i32(i32)
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+