khal_std/sync/
barriers.rs

1/// Workgroup memory barrier with group synchronization.
2///
3/// On GPU: calls `spirv_std::arch::workgroup_memory_barrier_with_group_sync`.
4/// On CPU: waits on the thread-local workgroup barrier (set by CPU dispatch).
5#[inline(always)]
6pub fn workgroup_memory_barrier_with_group_sync() {
7    #[cfg(target_arch = "spirv")]
8    {
9        spirv_std::arch::workgroup_memory_barrier_with_group_sync();
10    }
11    #[cfg(target_arch = "nvptx64")]
12    {
13        // Call the LLVM intrinsic directly instead of cuda_std::thread::sync_threads()
14        // so that LLVM sees the `convergent` attribute during optimization passes.
15        // Without this, LLVM tail-duplicates the barrier into both sides of divergent
16        // branches (if/else), causing threads to hit different bar.sync instructions
17        // and deadlocking the block.
18        // This fixes kernels with barriers that were otherwise hanging when using
19        // cuda_std::thread::sync_thread() instead.
20        unsafe extern "C" {
21            #[link_name = "llvm.nvvm.barrier0"]
22            fn nvvm_barrier0();
23        }
24        unsafe {
25            nvvm_barrier0();
26        }
27        //     cuda_std::thread::sync_threads();
28    }
29
30    #[cfg(not(any(target_arch = "spirv", target_arch = "nvptx64")))]
31    #[cfg(feature = "cpu")]
32    {
33        crate::arch::cpu::barrier_wait();
34    }
35}
36
37/// Control barrier with explicit execution scope, memory scope, and semantics.
38///
39/// On GPU (SPIR-V): calls `spirv_std::arch::control_barrier`.
40/// On GPU (CUDA): calls `__syncthreads()`.
41/// On CPU: waits on the thread-local workgroup barrier.
42#[inline(always)]
43pub fn control_barrier<const EXECUTION: u32, const MEMORY: u32, const SEMANTICS: u32>() {
44    #[cfg(target_arch = "spirv")]
45    {
46        spirv_std::arch::control_barrier::<EXECUTION, MEMORY, SEMANTICS>();
47    }
48    #[cfg(not(target_arch = "spirv"))]
49    {
50        // handle CUDA and CPU backends
51        workgroup_memory_barrier_with_group_sync();
52    }
53}
khal_std/sync/barriers.rs

khal_std/sync/
barriers.rs