
MIR passes do not take into account if an operation is convergent #137086

Open
kulst opened this issue Feb 15, 2025 · 13 comments
Labels
A-mir-opt Area: MIR optimizations C-bug Category: This is a bug. I-miscompile Issue: Correct Rust code lowers to incorrect machine code I-unsound Issue: A soundness hole (worst kind of bug), see: https://en.wikipedia.org/wiki/Soundness O-amdgcn Target: the Radeon 9001XT and such O-NVPTX Target: the NVPTX LLVM backend for running rust on GPUs, https://llvm.org/docs/NVPTXUsage.html T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. WG-mir-opt Working group: MIR optimizations

Comments

kulst commented Feb 15, 2025

Issue

I encountered an issue when using core::arch::nvptx::_syncthreads() while the MIR pass JumpThreading is enabled. The issue can be reproduced with a simple kernel when executed with the following parameters:

  • block_dim = BlockDim{x : 512, y : 1, z : 1};
  • grid_dim = GridDim{x : 2, y : 1, z : 1};
  • n = 1000;
#[no_mangle]
pub unsafe extern "ptx-kernel" fn memcpy(input: *const f32, output: *mut f32, n: u32) {
    let n = n as usize;
    // thread id x in grid
    let gtid_x = (_block_dim_x() * _block_idx_x() + _thread_idx_x()) as usize;
    // read input
    let temp = if gtid_x < n {
        input.add(gtid_x).read()
    } else {
        0f32
    };
    _syncthreads();
    // write output
    if gtid_x < n {
        output.add(gtid_x).write(temp);
    }
}

compute-sanitizer --tool synccheck complains about barrier errors. The resulting .ptx shows the reason: the code is transformed into the following:

...
if gtid_x < n {
    let temp = input.add(gtid_x).read();
    _syncthreads();
    output.add(gtid_x).write(temp);
} else {
    _syncthreads();
}
...

I tracked this transformation down to the MIR pass JumpThreading. However, _syncthreads() is a convergent operation, and this property must be taken into account when transforming code (see this LLVM issue for reference).

Turning off the JumpThreading MIR pass entirely therefore prevents this transformation, and the resulting code is correct (compute-sanitizer no longer complains).

PTX with JumpThreading
//
// Generated by LLVM NVPTX Back-End
//

.version 5.0
.target sm_61
.address_size 64

	// .globl	memcpy                  // -- Begin function memcpy
                                        // @memcpy
.visible .entry memcpy(
	.param .u64 memcpy_param_0,
	.param .u64 memcpy_param_1,
	.param .u32 memcpy_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<5>;
	.reg .f32 	%f<2>;
	.reg .b64 	%rd<10>;

// %bb.0:
	ld.param.u32 	%rd6, [memcpy_param_2];
	mov.u32 	%r1, %ntid.x;
	mov.u32 	%r2, %ctaid.x;
	mov.u32 	%r3, %tid.x;
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	cvt.s64.s32 	%rd3, %r4;
	setp.lt.u64 	%p1, %rd3, %rd6;
	@%p1 bra 	$L__BB0_2;
	bra.uni 	$L__BB0_1;
$L__BB0_2:
	ld.param.u64 	%rd4, [memcpy_param_0];
	ld.param.u64 	%rd5, [memcpy_param_1];
	cvta.to.global.u64 	%rd1, %rd5;
	cvta.to.global.u64 	%rd2, %rd4;
	shl.b64 	%rd7, %rd3, 2;
	add.s64 	%rd8, %rd2, %rd7;
	ld.global.f32 	%f1, [%rd8];
	bar.sync 	0;
	add.s64 	%rd9, %rd1, %rd7;
	st.global.f32 	[%rd9], %f1;
	bra.uni 	$L__BB0_3;
$L__BB0_1:
	bar.sync 	0;
$L__BB0_3:
	ret;
                                        // -- End function
}

PTX without JumpThreading
//
// Generated by LLVM NVPTX Back-End
//

.version 5.0
.target sm_61
.address_size 64

	// .globl	memcpy                  // -- Begin function memcpy
                                        // @memcpy
.visible .entry memcpy(
	.param .u64 memcpy_param_0,
	.param .u64 memcpy_param_1,
	.param .u32 memcpy_param_2
)
{
	.reg .pred 	%p<3>;
	.reg .b32 	%r<5>;
	.reg .f32 	%f<5>;
	.reg .b64 	%rd<11>;

// %bb.0:
	ld.param.u32 	%rd2, [memcpy_param_2];
	mov.u32 	%r1, %ntid.x;
	mov.u32 	%r2, %ctaid.x;
	mov.u32 	%r3, %tid.x;
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	cvt.s64.s32 	%rd3, %r4;
	setp.ge.u64 	%p1, %rd3, %rd2;
	mov.f32 	%f4, 0f00000000;
	@%p1 bra 	$L__BB0_2;
// %bb.1:
	ld.param.u64 	%rd6, [memcpy_param_0];
	cvta.to.global.u64 	%rd8, %rd6;
	mul.wide.s32 	%rd9, %r4, 4;
	add.s64 	%rd4, %rd8, %rd9;
	ld.global.f32 	%f4, [%rd4];
$L__BB0_2:
	setp.lt.u64 	%p2, %rd3, %rd2;
	bar.sync 	0;
	@%p2 bra 	$L__BB0_4;
	bra.uni 	$L__BB0_3;
$L__BB0_4:
	ld.param.u64 	%rd7, [memcpy_param_1];
	cvta.to.global.u64 	%rd1, %rd7;
	shl.b64 	%rd10, %rd3, 2;
	add.s64 	%rd5, %rd1, %rd10;
	st.global.f32 	[%rd5], %f4;
$L__BB0_3:
	ret;
                                        // -- End function
}

In the above example _syncthreads() does nothing useful and could be omitted. However, I encountered this issue in a more complex stencil kernel, where these transformations led to side effects and race conditions.

Compiler arguments

With JumpThreading:

cargo +nightly-2025-02-14 rustc --release -- -C target-cpu=sm_61 -Clinker-flavor=llbc -Zunstable-options

Without JumpThreading:

cargo +nightly-2025-02-14 rustc --release -- -C target-cpu=sm_61 -Clinker-flavor=llbc -Zunstable-options -Zmir-enable-passes=-JumpThreading

Background

Targets like nvptx64-nvidia-cuda, amdgpu, and probably also SPIR-V (rust-gpu) make use of so-called convergent operations (like _syncthreads()). LLVM provides a detailed explanation of this type of operation. Special care must be taken when transforming code that involves convergent operations.
To my knowledge, rustc does not know whether an operation is convergent, so passes do not handle these operations correctly.

Zulip-Stream

Noratrieb (Member) commented

@rust-lang/wg-mir-opt and @rust-lang/opsem might be interested as well

RalfJung (Member) commented Feb 15, 2025

Seems like that's a pretty cursed intrinsic, and should not be exposed as a regular Rust intrinsic. This is another case of hardware vendors thinking they can just add something to C or Rust and say it works "like this hardware instruction"... but that's not how this goes, hardware vendors need to work with language designers to find a way to add those semantics into the language.

hanna-kruppe (Contributor) commented

Can this be worked around by modeling the intrinsic as a call to an arbitrary external function? In other words: is the “convergent” concept in LLVM only needed to preserve more optimizations around uses of these operations, or is it even more special (language-spec-breaking) than that?

bjorn3 (Member) commented Feb 15, 2025

This is another case of hardware vendors thinking they can just add something to C or Rust and say it works "like this hardware instruction"... but that's not how this goes, hardware vendors need to work with language designers to find a way to add those semantics into the language.

GPU vendors did not add this to C initially. When Nvidia introduced programmable shader units, they and Microsoft each created a C-like language (around 2001, I think), which they called Cg (C for graphics) and HLSL (High Level Shading Language) respectively (according to https://gamedev.net/forums/topic/281349-hlsl-vs-cg-any-significant-difference/2759856/ these two languages are nearly identical). It was only much later that real C and C++ were supported on GPUs (CUDA released in 2007 and OpenCL in 2009, using C++ and C respectively as the basis for their programming languages).

bjorn3 (Member) commented Feb 15, 2025

Can this be worked around by modeling the intrinsic as a call to an arbitrary external function? In other words: is the “convergent” concept in LLVM only needed to preserve more optimizations around uses of these operations, or is it even more special (language-spec-breaking) than that?

Using an extern function call would still allow the call to be duplicated into the two branches. The convergence requirement means that every single thread within a thread group has to reach the exact same instruction. Some GPUs implement threads within a thread group by basically having a single execution stream, using a separate vector lane for each "thread" and masking/unmasking lanes depending on which branch of an if those threads took. Instructions that require convergence don't respect this masking. And on other GPUs I would guess it causes issues like deadlocks and other misbehavior.

nikic (Contributor) commented Feb 15, 2025

I think as far as Rust is concerned, it should be good enough to have a target flag that says "this target may have convergent operations" and disable certain MIR optimizations like JumpThreading entirely in that case. The necessary complexity to correctly perform control flow optimizations in the presence of convergent operations is not worth the bother at Rust's level.

RalfJung (Member) commented

So a convergent call that's present transitively anywhere inside a function I call means that call has to be treated specially? How can LLVM even handle this soundly, e.g. when function pointers are involved?

nikic (Contributor) commented Feb 15, 2025

So a convergent call that's present transitively anywhere inside a function I call means that call has to be treated specially? How can LLVM even handle this soundly, e.g. when function pointers are involved?

Clang marks all functions and calls as convergent by default when targeting certain languages or targets (like CUDA/OpenCL and AMDGPU/NVPTX). LLVM will then try to infer non-convergent functions. (This is inverted from how attributes usually work.)

Rust should do that as well.

saethlin (Member) commented

In the short term, we should dull the pain by adding a hacky check in the MirPass::is_enabled impl for JumpThreading, just like how we disabled alignment checks on i686-pc-windows-msvc. Such a change does not resolve this issue, but we should do it anyway.

What has me really confused is that we're only hearing about this problem now. JumpThreading was enabled 6 stable releases ago. Have GPU users been blanket disabling all MIR opts since before 1.78 or something?

kulst (Author) commented Feb 15, 2025

In my research the bug manifested only in a very specific case; it really was a coincidence that I encountered it.

@kjetilkjeka as one of the more frequent users did you encounter something like this in the past?
@jedbrown as you were reasoning about convergent operations in the zulip I wanted to make you aware of this issue.

kjetilkjeka (Contributor) commented

@kjetilkjeka as one of the more frequent users did you encounter something like this in the past?

Not really. Our GPU code that uses thread syncing also uses other features that are not straightforward in Rust and is therefore written in C. I'm sure that if things like shared memory were available in Rust, we would have a lot of cases where this broke for us. Our Rust GPU code almost always reads an input buffer, performs some calculation, and writes to a separate output buffer.

kulst (Author) commented Mar 18, 2025

I am currently working on a fix which does what @nikic suggested:

  1. Adding a flag to TargetOptions which indicates whether a target has convergent operations (this flag is true for nvptx64 and amdgpu)
  2. Disabling JumpThreading for targets where this flag is set
  3. Adding the LLVM convergent attribute to all functions, function calls and inline asm calls (mimicking the behavior of clang)

branch

This should initially reduce the pain. However, a few things are still unclear to me:

  1. Are there any other optimizations in rustc that must be disabled?
  2. As far as I know, uncontrolled convergent semantics in LLVM are underspecified. Because of this, there is an ongoing effort in LLVM toward controlled convergence (controlled usage of convergence control tokens). I am unsure whether this underspecification can produce unsound code.

It may be an option to use convergence control tokens as soon as they are stable in LLVM (to my knowledge, clang currently only emits them for HLSL/SPIR-V; there is active development ongoing to respect convergence control tokens in LLVM's passes and targets).

saethlin (Member) commented

  1. Are there any other optimizations in rustc that must be disabled?

Is there a good reason to leave any enabled?
