/// Reads the PTX `%laneid` special register and returns its value.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn lane_id() -> u32 {
    let lane: u32;
    // SAFETY: the instruction only copies a special register into the output
    // register; it has no memory operands.
    unsafe {
        core::arch::asm!(
            "mov.u32 {}, %laneid;",
            out(reg32) lane,
        );
    }
    lane
}
/// Fallback `lane_id` for targets that are neither `nvptx64` nor `amdgpu`.
///
/// Always returns `0`.
#[cfg(not(any(target_arch = "nvptx64", target_arch = "amdgpu")))]
#[inline(always)]
pub fn lane_id() -> u32 {
    const FALLBACK_LANE: u32 = 0;
    FALLBACK_LANE
}
/// Reads the PTX `%tid.x` special register and returns its value.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn thread_id_x() -> u32 {
    let tid: u32;
    // SAFETY: the instruction only copies a special register into the output
    // register; it has no memory operands.
    unsafe {
        core::arch::asm!(
            "mov.u32 {}, %tid.x;",
            out(reg32) tid,
        );
    }
    tid
}
/// Emits a `shfl.sync.bfly.b32` instruction and returns its destination value.
///
/// * `mask` - passed as the member-mask operand.
/// * `val` - the 32-bit value this lane contributes.
/// * `lane_mask` - passed as the lane operand of the butterfly shuffle.
///
/// The `c` operand is fixed at 31.
///
/// NOTE(review): presumably every lane named in `mask` must reach this call
/// together; that is the caller's responsibility — confirm against the PTX spec.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn shfl_sync_bfly_i32(mask: u32, val: i32, lane_mask: u32) -> i32 {
    let shuffled: i32;
    // SAFETY: all operands are plain 32-bit registers and the instruction has
    // no memory operands.
    unsafe {
        core::arch::asm!(
            "shfl.sync.bfly.b32 {result}, {val}, {lane_mask}, 31, {mask};",
            result = out(reg32) shuffled,
            val = in(reg32) val,
            lane_mask = in(reg32) lane_mask,
            mask = in(reg32) mask,
        );
    }
    shuffled
}
/// Emits a `shfl.sync.down.b32` instruction and returns its destination value.
///
/// * `mask` - passed as the member-mask operand.
/// * `val` - the 32-bit value this lane contributes.
/// * `delta` - passed as the lane-offset operand.
///
/// The `c` operand is fixed at 31.
///
/// NOTE(review): presumably every lane named in `mask` must reach this call
/// together; that is the caller's responsibility — confirm against the PTX spec.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn shfl_sync_down_i32(mask: u32, val: i32, delta: u32) -> i32 {
    let shuffled: i32;
    // SAFETY: all operands are plain 32-bit registers and the instruction has
    // no memory operands.
    unsafe {
        core::arch::asm!(
            "shfl.sync.down.b32 {result}, {val}, {delta}, 31, {mask};",
            result = out(reg32) shuffled,
            val = in(reg32) val,
            delta = in(reg32) delta,
            mask = in(reg32) mask,
        );
    }
    shuffled
}
/// Emits a `shfl.sync.up.b32` instruction and returns its destination value.
///
/// * `mask` - passed as the member-mask operand.
/// * `val` - the 32-bit value this lane contributes.
/// * `delta` - passed as the lane-offset operand.
///
/// The `c` operand is fixed at 0 (unlike the other shuffle modes in this
/// file, which pass 31).
///
/// NOTE(review): presumably every lane named in `mask` must reach this call
/// together; that is the caller's responsibility — confirm against the PTX spec.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn shfl_sync_up_i32(mask: u32, val: i32, delta: u32) -> i32 {
    let shuffled: i32;
    // SAFETY: all operands are plain 32-bit registers and the instruction has
    // no memory operands.
    unsafe {
        core::arch::asm!(
            "shfl.sync.up.b32 {result}, {val}, {delta}, 0, {mask};",
            result = out(reg32) shuffled,
            val = in(reg32) val,
            delta = in(reg32) delta,
            mask = in(reg32) mask,
        );
    }
    shuffled
}
/// Emits a `shfl.sync.idx.b32` instruction and returns its destination value.
///
/// * `mask` - passed as the member-mask operand.
/// * `val` - the 32-bit value this lane contributes.
/// * `src_lane` - passed as the source-lane operand.
///
/// The `c` operand is fixed at 31.
///
/// NOTE(review): presumably every lane named in `mask` must reach this call
/// together; that is the caller's responsibility — confirm against the PTX spec.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn shfl_sync_idx_i32(mask: u32, val: i32, src_lane: u32) -> i32 {
    let shuffled: i32;
    // SAFETY: all operands are plain 32-bit registers and the instruction has
    // no memory operands.
    unsafe {
        core::arch::asm!(
            "shfl.sync.idx.b32 {result}, {val}, {src_lane}, 31, {mask};",
            result = out(reg32) shuffled,
            val = in(reg32) val,
            src_lane = in(reg32) src_lane,
            mask = in(reg32) mask,
        );
    }
    shuffled
}
/// Butterfly shuffle with a caller-supplied `width`.
///
/// Emits `shfl.sync.bfly.b32` with a computed `c` operand: `32 - width` is
/// placed in bits 8 and above, and `0x1F` fills the low bits.
///
/// * `mask` - passed as the member-mask operand.
/// * `val` - the 32-bit value this lane contributes.
/// * `lane_mask` - passed as the lane operand of the butterfly shuffle.
/// * `width` - must not exceed 32 (checked with `debug_assert!`).
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn shfl_sync_bfly_i32_width(mask: u32, val: i32, lane_mask: u32, width: u32) -> i32 {
    debug_assert!(width <= 32, "width {width} exceeds 32-lane shuffle limit");
    let segment_bits = 32 - width;
    let packed = (segment_bits << 8) | 0x1F;
    let shuffled: i32;
    // SAFETY: all operands are plain 32-bit registers and the instruction has
    // no memory operands.
    unsafe {
        core::arch::asm!(
            "shfl.sync.bfly.b32 {result}, {val}, {lane_mask}, {c}, {mask};",
            result = out(reg32) shuffled,
            val = in(reg32) val,
            lane_mask = in(reg32) lane_mask,
            c = in(reg32) packed,
            mask = in(reg32) mask,
        );
    }
    shuffled
}
/// Shuffle-down with a caller-supplied `width`.
///
/// Emits `shfl.sync.down.b32` with a computed `c` operand: `32 - width` is
/// placed in bits 8 and above (the segment mask), and `0x1F` fills the low
/// bits (the clamp).
///
/// * `mask` - passed as the member-mask operand.
/// * `val` - the 32-bit value this lane contributes.
/// * `delta` - passed as the lane-offset operand.
/// * `width` - must be in `1..=32` (checked with `debug_assert!`).
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn shfl_sync_down_i32_width(mask: u32, val: i32, delta: u32, width: u32) -> i32 {
    debug_assert!(width <= 32, "width {width} exceeds 32-lane shuffle limit");
    debug_assert!(width != 0, "width must be at least 1");
    // The instruction ANDs the clamp with the inverted segment mask, so a
    // clamp of 0x1F yields the same upper lane bound as `width - 1` for every
    // width in 1..=32. Using the constant avoids the `width - 1` underflow at
    // width == 0 and matches the butterfly variant's encoding.
    let c = ((32 - width) << 8) | 0x1F;
    let result: i32;
    // SAFETY: all operands are plain 32-bit registers and the instruction has
    // no memory operands.
    unsafe {
        core::arch::asm!(
            "shfl.sync.down.b32 {result}, {val}, {delta}, {c}, {mask};",
            result = out(reg32) result,
            val = in(reg32) val,
            delta = in(reg32) delta,
            c = in(reg32) c,
            mask = in(reg32) mask,
        );
    }
    result
}
/// Shuffle-up with a caller-supplied `width`.
///
/// Emits `shfl.sync.up.b32` with a computed `c` operand: `32 - width` is
/// placed in bits 8 and above, and the low bits are left at zero.
///
/// * `mask` - passed as the member-mask operand.
/// * `val` - the 32-bit value this lane contributes.
/// * `delta` - passed as the lane-offset operand.
/// * `width` - must not exceed 32 (checked with `debug_assert!`).
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn shfl_sync_up_i32_width(mask: u32, val: i32, delta: u32, width: u32) -> i32 {
    debug_assert!(width <= 32, "width {width} exceeds 32-lane shuffle limit");
    let segment_bits = 32 - width;
    let packed = segment_bits << 8;
    let shuffled: i32;
    // SAFETY: all operands are plain 32-bit registers and the instruction has
    // no memory operands.
    unsafe {
        core::arch::asm!(
            "shfl.sync.up.b32 {result}, {val}, {delta}, {c}, {mask};",
            result = out(reg32) shuffled,
            val = in(reg32) val,
            delta = in(reg32) delta,
            c = in(reg32) packed,
            mask = in(reg32) mask,
        );
    }
    shuffled
}
/// Emits a `vote.sync.ballot.b32` instruction and returns its 32-bit result.
///
/// * `mask` - passed as the member-mask operand.
/// * `predicate` - this lane's vote.
///
/// The boolean is widened to a `u32` and turned into a PTX predicate register
/// with `setp.ne.u32 ..., 0` before the vote.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn ballot_sync(mask: u32, predicate: bool) -> u32 {
    let vote_input = u32::from(predicate);
    let ballot: u32;
    // SAFETY: all operands are plain 32-bit registers; the scratch predicate
    // register is declared inside its own `{ }` scope within the asm text.
    unsafe {
        core::arch::asm!(
            "{{",
            ".reg .pred %p_vote;",
            "setp.ne.u32 %p_vote, {pred_in}, 0;",
            "vote.sync.ballot.b32 {result}, %p_vote, {mask};",
            "}}",
            pred_in = in(reg32) vote_input,
            result = out(reg32) ballot,
            mask = in(reg32) mask,
        );
    }
    ballot
}
/// Emits a `bar.warp.sync` instruction with `mask` as its operand.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn syncwarp(mask: u32) {
    // SAFETY: the single operand is a plain 32-bit register and the
    // instruction has no memory operands.
    unsafe {
        core::arch::asm!("bar.warp.sync {mask};", mask = in(reg32) mask);
    }
}
/// Emits a `membar.gl` instruction.
#[cfg(target_arch = "nvptx64")]
#[inline(always)]
pub fn threadfence() {
    // SAFETY: the instruction takes no operands.
    unsafe { core::arch::asm!("membar.gl;") };
}
/// Returns `val` unchanged; `xor_mask` is accepted but not used.
///
/// NOTE(review): looks like a placeholder for a real DPP row-xor operation —
/// confirm whether an actual cross-lane implementation is still pending.
#[cfg(target_arch = "amdgpu")]
#[inline(always)]
pub fn dpp_row_xor_i32(val: i32, xor_mask: u32) -> i32 {
    // Explicitly discard the argument so it does not trigger an unused warning.
    _ = xor_mask;
    val
}
/// Returns `val` unchanged; `src_lane_x4` is accepted but not used.
///
/// NOTE(review): looks like a placeholder for a real `ds_bpermute` operation —
/// confirm whether an actual cross-lane implementation is still pending.
#[cfg(target_arch = "amdgpu")]
#[inline(always)]
pub fn ds_bpermute_i32(val: i32, src_lane_x4: u32) -> i32 {
    // Explicitly discard the argument so it does not trigger an unused warning.
    _ = src_lane_x4;
    val
}
/// Returns a constant with all 64 bits set.
///
/// NOTE(review): this does not read any hardware register; presumably a
/// placeholder for the real execution mask — confirm.
#[cfg(target_arch = "amdgpu")]
#[inline(always)]
pub fn exec_mask() -> u64 {
    u64::MAX
}
/// Cross-lane shuffle operations for `Copy` values.
///
/// Implementors supply the four full-width shuffles. The `*_width` variants
/// have default bodies that ignore the width argument and forward to the
/// matching full-width method; implementations may override them.
#[diagnostic::on_unimplemented(
    message = "`{Self}` cannot be shuffled across GPU lanes",
    label = "GpuShuffle is implemented for i32, u32, f32, i64, u64, f64, bool — use one of these types",
    note = "larger types require two shuffles; implement GpuShuffle manually for custom types"
)]
pub trait GpuShuffle: crate::gpu_sealed::GpuSealed + Copy + 'static {
    /// Butterfly (XOR) shuffle using `xor_mask`.
    fn gpu_shfl_xor(self, xor_mask: u32) -> Self;

    /// Shuffle-down by `delta`.
    fn gpu_shfl_down(self, delta: u32) -> Self;

    /// Shuffle-up by `delta`.
    fn gpu_shfl_up(self, delta: u32) -> Self;

    /// Indexed shuffle reading from `src_lane`.
    fn gpu_shfl_idx(self, src_lane: u32) -> Self;

    /// Width-limited butterfly shuffle; the default ignores `_width`.
    fn gpu_shfl_xor_width(self, xor_mask: u32, _width: u32) -> Self {
        <Self as GpuShuffle>::gpu_shfl_xor(self, xor_mask)
    }

    /// Width-limited shuffle-down; the default ignores `_width`.
    fn gpu_shfl_down_width(self, delta: u32, _width: u32) -> Self {
        <Self as GpuShuffle>::gpu_shfl_down(self, delta)
    }

    /// Width-limited shuffle-up; the default ignores `_width`.
    fn gpu_shfl_up_width(self, delta: u32, _width: u32) -> Self {
        <Self as GpuShuffle>::gpu_shfl_up(self, delta)
    }
}
/// NVPTX shuffles for `i32`: each method forwards straight to the matching
/// `shfl_sync_*` wrapper with an all-ones member mask.
#[cfg(target_arch = "nvptx64")]
impl GpuShuffle for i32 {
    #[inline(always)]
    fn gpu_shfl_xor(self, xor_mask: u32) -> Self {
        shfl_sync_bfly_i32(u32::MAX, self, xor_mask)
    }

    #[inline(always)]
    fn gpu_shfl_down(self, delta: u32) -> Self {
        shfl_sync_down_i32(u32::MAX, self, delta)
    }

    #[inline(always)]
    fn gpu_shfl_up(self, delta: u32) -> Self {
        shfl_sync_up_i32(u32::MAX, self, delta)
    }

    #[inline(always)]
    fn gpu_shfl_idx(self, src_lane: u32) -> Self {
        shfl_sync_idx_i32(u32::MAX, self, src_lane)
    }

    #[inline(always)]
    fn gpu_shfl_xor_width(self, xor_mask: u32, width: u32) -> Self {
        shfl_sync_bfly_i32_width(u32::MAX, self, xor_mask, width)
    }

    #[inline(always)]
    fn gpu_shfl_down_width(self, delta: u32, width: u32) -> Self {
        shfl_sync_down_i32_width(u32::MAX, self, delta, width)
    }

    #[inline(always)]
    fn gpu_shfl_up_width(self, delta: u32, width: u32) -> Self {
        shfl_sync_up_i32_width(u32::MAX, self, delta, width)
    }
}
/// NVPTX shuffles for `f32`: the float's bit pattern is shuffled as an `i32`
/// and converted back with `f32::from_bits`.
///
/// The full-width methods call the `shfl_sync_*` wrappers directly with an
/// all-ones member mask; the `*_width` methods go through the `i32` impl.
#[cfg(target_arch = "nvptx64")]
impl GpuShuffle for f32 {
    #[inline(always)]
    fn gpu_shfl_xor(self, xor_mask: u32) -> Self {
        let payload = self.to_bits() as i32;
        let received = shfl_sync_bfly_i32(u32::MAX, payload, xor_mask);
        f32::from_bits(received as u32)
    }

    #[inline(always)]
    fn gpu_shfl_down(self, delta: u32) -> Self {
        let payload = self.to_bits() as i32;
        let received = shfl_sync_down_i32(u32::MAX, payload, delta);
        f32::from_bits(received as u32)
    }

    #[inline(always)]
    fn gpu_shfl_up(self, delta: u32) -> Self {
        let payload = self.to_bits() as i32;
        let received = shfl_sync_up_i32(u32::MAX, payload, delta);
        f32::from_bits(received as u32)
    }

    #[inline(always)]
    fn gpu_shfl_idx(self, src_lane: u32) -> Self {
        let payload = self.to_bits() as i32;
        let received = shfl_sync_idx_i32(u32::MAX, payload, src_lane);
        f32::from_bits(received as u32)
    }

    #[inline(always)]
    fn gpu_shfl_xor_width(self, xor_mask: u32, width: u32) -> Self {
        let payload = self.to_bits() as i32;
        let received = payload.gpu_shfl_xor_width(xor_mask, width);
        f32::from_bits(received as u32)
    }

    #[inline(always)]
    fn gpu_shfl_down_width(self, delta: u32, width: u32) -> Self {
        let payload = self.to_bits() as i32;
        let received = payload.gpu_shfl_down_width(delta, width);
        f32::from_bits(received as u32)
    }

    #[inline(always)]
    fn gpu_shfl_up_width(self, delta: u32, width: u32) -> Self {
        let payload = self.to_bits() as i32;
        let received = payload.gpu_shfl_up_width(delta, width);
        f32::from_bits(received as u32)
    }
}
/// NVPTX shuffles for `u32`: the value is reinterpreted as `i32`, shuffled,
/// and cast back.
///
/// The full-width methods call the `shfl_sync_*` wrappers directly with an
/// all-ones member mask; the `*_width` methods go through the `i32` impl.
#[cfg(target_arch = "nvptx64")]
impl GpuShuffle for u32 {
    #[inline(always)]
    fn gpu_shfl_xor(self, xor_mask: u32) -> Self {
        let payload = self as i32;
        shfl_sync_bfly_i32(u32::MAX, payload, xor_mask) as u32
    }

    #[inline(always)]
    fn gpu_shfl_down(self, delta: u32) -> Self {
        let payload = self as i32;
        shfl_sync_down_i32(u32::MAX, payload, delta) as u32
    }

    #[inline(always)]
    fn gpu_shfl_up(self, delta: u32) -> Self {
        let payload = self as i32;
        shfl_sync_up_i32(u32::MAX, payload, delta) as u32
    }

    #[inline(always)]
    fn gpu_shfl_idx(self, src_lane: u32) -> Self {
        let payload = self as i32;
        shfl_sync_idx_i32(u32::MAX, payload, src_lane) as u32
    }

    #[inline(always)]
    fn gpu_shfl_xor_width(self, xor_mask: u32, width: u32) -> Self {
        let payload = self as i32;
        payload.gpu_shfl_xor_width(xor_mask, width) as u32
    }

    #[inline(always)]
    fn gpu_shfl_down_width(self, delta: u32, width: u32) -> Self {
        let payload = self as i32;
        payload.gpu_shfl_down_width(delta, width) as u32
    }

    #[inline(always)]
    fn gpu_shfl_up_width(self, delta: u32, width: u32) -> Self {
        let payload = self as i32;
        payload.gpu_shfl_up_width(delta, width) as u32
    }
}
/// NVPTX shuffles for `i64`: the value is split into two 32-bit halves, each
/// half is shuffled separately (low half first, then high half), and the
/// halves are recombined.
///
/// The full-width methods call the `shfl_sync_*` wrappers directly with an
/// all-ones member mask; the `*_width` methods go through the `i32` impl.
#[cfg(target_arch = "nvptx64")]
impl GpuShuffle for i64 {
    #[inline(always)]
    fn gpu_shfl_xor(self, xor_mask: u32) -> Self {
        let raw = self as u64;
        let low_in = raw as i32;
        let high_in = (raw >> 32) as i32;
        let low_out = shfl_sync_bfly_i32(u32::MAX, low_in, xor_mask) as u32;
        let high_out = shfl_sync_bfly_i32(u32::MAX, high_in, xor_mask) as u32;
        ((u64::from(high_out) << 32) | u64::from(low_out)) as i64
    }

    #[inline(always)]
    fn gpu_shfl_down(self, delta: u32) -> Self {
        let raw = self as u64;
        let low_in = raw as i32;
        let high_in = (raw >> 32) as i32;
        let low_out = shfl_sync_down_i32(u32::MAX, low_in, delta) as u32;
        let high_out = shfl_sync_down_i32(u32::MAX, high_in, delta) as u32;
        ((u64::from(high_out) << 32) | u64::from(low_out)) as i64
    }

    #[inline(always)]
    fn gpu_shfl_up(self, delta: u32) -> Self {
        let raw = self as u64;
        let low_in = raw as i32;
        let high_in = (raw >> 32) as i32;
        let low_out = shfl_sync_up_i32(u32::MAX, low_in, delta) as u32;
        let high_out = shfl_sync_up_i32(u32::MAX, high_in, delta) as u32;
        ((u64::from(high_out) << 32) | u64::from(low_out)) as i64
    }

    #[inline(always)]
    fn gpu_shfl_idx(self, src_lane: u32) -> Self {
        let raw = self as u64;
        let low_in = raw as i32;
        let high_in = (raw >> 32) as i32;
        let low_out = shfl_sync_idx_i32(u32::MAX, low_in, src_lane) as u32;
        let high_out = shfl_sync_idx_i32(u32::MAX, high_in, src_lane) as u32;
        ((u64::from(high_out) << 32) | u64::from(low_out)) as i64
    }

    #[inline(always)]
    fn gpu_shfl_xor_width(self, xor_mask: u32, width: u32) -> Self {
        let raw = self as u64;
        let low_in = raw as i32;
        let high_in = (raw >> 32) as i32;
        let low_out = low_in.gpu_shfl_xor_width(xor_mask, width) as u32;
        let high_out = high_in.gpu_shfl_xor_width(xor_mask, width) as u32;
        ((u64::from(high_out) << 32) | u64::from(low_out)) as i64
    }

    #[inline(always)]
    fn gpu_shfl_down_width(self, delta: u32, width: u32) -> Self {
        let raw = self as u64;
        let low_in = raw as i32;
        let high_in = (raw >> 32) as i32;
        let low_out = low_in.gpu_shfl_down_width(delta, width) as u32;
        let high_out = high_in.gpu_shfl_down_width(delta, width) as u32;
        ((u64::from(high_out) << 32) | u64::from(low_out)) as i64
    }

    #[inline(always)]
    fn gpu_shfl_up_width(self, delta: u32, width: u32) -> Self {
        let raw = self as u64;
        let low_in = raw as i32;
        let high_in = (raw >> 32) as i32;
        let low_out = low_in.gpu_shfl_up_width(delta, width) as u32;
        let high_out = high_in.gpu_shfl_up_width(delta, width) as u32;
        ((u64::from(high_out) << 32) | u64::from(low_out)) as i64
    }
}
/// NVPTX shuffles for `u64`: every method casts to `i64`, delegates to the
/// `i64` impl, and casts the result back.
#[cfg(target_arch = "nvptx64")]
impl GpuShuffle for u64 {
    #[inline(always)]
    fn gpu_shfl_xor(self, xor_mask: u32) -> Self {
        let signed = self as i64;
        signed.gpu_shfl_xor(xor_mask) as u64
    }

    #[inline(always)]
    fn gpu_shfl_down(self, delta: u32) -> Self {
        let signed = self as i64;
        signed.gpu_shfl_down(delta) as u64
    }

    #[inline(always)]
    fn gpu_shfl_up(self, delta: u32) -> Self {
        let signed = self as i64;
        signed.gpu_shfl_up(delta) as u64
    }

    #[inline(always)]
    fn gpu_shfl_idx(self, src_lane: u32) -> Self {
        let signed = self as i64;
        signed.gpu_shfl_idx(src_lane) as u64
    }

    #[inline(always)]
    fn gpu_shfl_xor_width(self, xor_mask: u32, width: u32) -> Self {
        let signed = self as i64;
        signed.gpu_shfl_xor_width(xor_mask, width) as u64
    }

    #[inline(always)]
    fn gpu_shfl_down_width(self, delta: u32, width: u32) -> Self {
        let signed = self as i64;
        signed.gpu_shfl_down_width(delta, width) as u64
    }

    #[inline(always)]
    fn gpu_shfl_up_width(self, delta: u32, width: u32) -> Self {
        let signed = self as i64;
        signed.gpu_shfl_up_width(delta, width) as u64
    }
}
/// NVPTX shuffles for `f64`: the double's bit pattern is shuffled through the
/// `i64` impl and converted back with `f64::from_bits`.
#[cfg(target_arch = "nvptx64")]
impl GpuShuffle for f64 {
    #[inline(always)]
    fn gpu_shfl_xor(self, xor_mask: u32) -> Self {
        let payload = self.to_bits() as i64;
        let received = payload.gpu_shfl_xor(xor_mask);
        f64::from_bits(received as u64)
    }

    #[inline(always)]
    fn gpu_shfl_down(self, delta: u32) -> Self {
        let payload = self.to_bits() as i64;
        let received = payload.gpu_shfl_down(delta);
        f64::from_bits(received as u64)
    }

    #[inline(always)]
    fn gpu_shfl_up(self, delta: u32) -> Self {
        let payload = self.to_bits() as i64;
        let received = payload.gpu_shfl_up(delta);
        f64::from_bits(received as u64)
    }

    #[inline(always)]
    fn gpu_shfl_idx(self, src_lane: u32) -> Self {
        let payload = self.to_bits() as i64;
        let received = payload.gpu_shfl_idx(src_lane);
        f64::from_bits(received as u64)
    }

    #[inline(always)]
    fn gpu_shfl_xor_width(self, xor_mask: u32, width: u32) -> Self {
        let payload = self.to_bits() as i64;
        let received = payload.gpu_shfl_xor_width(xor_mask, width);
        f64::from_bits(received as u64)
    }

    #[inline(always)]
    fn gpu_shfl_down_width(self, delta: u32, width: u32) -> Self {
        let payload = self.to_bits() as i64;
        let received = payload.gpu_shfl_down_width(delta, width);
        f64::from_bits(received as u64)
    }

    #[inline(always)]
    fn gpu_shfl_up_width(self, delta: u32, width: u32) -> Self {
        let payload = self.to_bits() as i64;
        let received = payload.gpu_shfl_up_width(delta, width);
        f64::from_bits(received as u64)
    }
}
/// Generates a `GpuShuffle` impl for each listed type on targets that are
/// neither `nvptx64` nor `amdgpu`.
///
/// Every required method returns `self` unchanged and ignores its lane
/// argument; the `*_width` methods are left to the trait defaults.
macro_rules! impl_cpu_gpu_shuffle {
    ($($scalar:ty),+) => {
        $(
            #[cfg(not(any(target_arch = "nvptx64", target_arch = "amdgpu")))]
            impl GpuShuffle for $scalar {
                fn gpu_shfl_xor(self, _xor_mask: u32) -> Self {
                    self
                }

                fn gpu_shfl_down(self, _delta: u32) -> Self {
                    self
                }

                fn gpu_shfl_up(self, _delta: u32) -> Self {
                    self
                }

                fn gpu_shfl_idx(self, _src_lane: u32) -> Self {
                    self
                }
            }
        )+
    };
}

// Identity impls for the numeric types.
impl_cpu_gpu_shuffle!(i32, f32, u32, i64, u64, f64);
/// NVPTX shuffles for `bool`: the flag is widened to an `i32` (0 or 1),
/// shuffled, and the received value is compared against zero.
///
/// The full-width methods call the `shfl_sync_*` wrappers directly with an
/// all-ones member mask; the `*_width` methods go through the `i32` impl.
#[cfg(target_arch = "nvptx64")]
impl GpuShuffle for bool {
    #[inline(always)]
    fn gpu_shfl_xor(self, xor_mask: u32) -> Self {
        let received = shfl_sync_bfly_i32(u32::MAX, i32::from(self), xor_mask);
        received != 0
    }

    #[inline(always)]
    fn gpu_shfl_down(self, delta: u32) -> Self {
        let received = shfl_sync_down_i32(u32::MAX, i32::from(self), delta);
        received != 0
    }

    #[inline(always)]
    fn gpu_shfl_up(self, delta: u32) -> Self {
        let received = shfl_sync_up_i32(u32::MAX, i32::from(self), delta);
        received != 0
    }

    #[inline(always)]
    fn gpu_shfl_idx(self, src_lane: u32) -> Self {
        let received = shfl_sync_idx_i32(u32::MAX, i32::from(self), src_lane);
        received != 0
    }

    #[inline(always)]
    fn gpu_shfl_xor_width(self, xor_mask: u32, width: u32) -> Self {
        let received = i32::from(self).gpu_shfl_xor_width(xor_mask, width);
        received != 0
    }

    #[inline(always)]
    fn gpu_shfl_down_width(self, delta: u32, width: u32) -> Self {
        let received = i32::from(self).gpu_shfl_down_width(delta, width);
        received != 0
    }

    #[inline(always)]
    fn gpu_shfl_up_width(self, delta: u32, width: u32) -> Self {
        let received = i32::from(self).gpu_shfl_up_width(delta, width);
        received != 0
    }
}

// Identity impl for `bool` on targets that are neither nvptx64 nor amdgpu.
impl_cpu_gpu_shuffle!(bool);