From deb2e1022eb1e43ed6656b969e11a7432d814287 Mon Sep 17 00:00:00 2001
From: Snehal Reddy
Date: Sun, 19 Apr 2026 14:22:16 +0100
Subject: [PATCH] fix(cuda_std): use correct PTX scope suffix in block acqrel fence

This updates the inline assembly to use the correct `.cta` (cooperative
thread array) scope suffix, ensuring that block-level fences don't incur
the unnecessary performance overhead of a system-wide synchronization.
---
 crates/cuda_std/src/atomic/intrinsics.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/cuda_std/src/atomic/intrinsics.rs b/crates/cuda_std/src/atomic/intrinsics.rs
index 1da0c68b..af8eda8d 100644
--- a/crates/cuda_std/src/atomic/intrinsics.rs
+++ b/crates/cuda_std/src/atomic/intrinsics.rs
@@ -42,7 +42,7 @@ pub unsafe fn fence_acqrel_device() {
 
 #[gpu_only]
 pub unsafe fn fence_acqrel_block() {
-    asm!("fence.acq_rel.sys;");
+    asm!("fence.acq_rel.cta;");
 }
 
 #[gpu_only]