use cuda_std_macros::gpu_only;
use vek::{Vec2, Vec3};
// Raw special-register, barrier, and fence intrinsics lowered by the
// NVVM/PTX codegen backend. They are only referenced from the
// `#[gpu_only]` wrappers below, which form the public surface.
extern "C" {
// Zero-based thread coordinates within the block.
fn __nvvm_thread_idx_x() -> u32;
fn __nvvm_thread_idx_y() -> u32;
fn __nvvm_thread_idx_z() -> u32;
// Block dimensions (threads per block along each axis).
fn __nvvm_block_dim_x() -> u32;
fn __nvvm_block_dim_y() -> u32;
fn __nvvm_block_dim_z() -> u32;
// Zero-based block coordinates within the grid.
fn __nvvm_block_idx_x() -> u32;
fn __nvvm_block_idx_y() -> u32;
fn __nvvm_block_idx_z() -> u32;
// Grid dimensions (blocks per grid along each axis).
fn __nvvm_grid_dim_x() -> u32;
fn __nvvm_grid_dim_y() -> u32;
fn __nvvm_grid_dim_z() -> u32;
// Number of threads per warp.
fn __nvvm_warp_size() -> u32;
// Block-wide execution barrier, then memory fences of increasing scope.
fn __nvvm_block_barrier();
fn __nvvm_grid_fence();
fn __nvvm_device_fence();
fn __nvvm_system_fence();
}
#[cfg(target_os = "cuda")]
// Reads a special register via `$func_name` and hands LLVM a range hint:
// if the value fell outside the declared (inclusive) bounds, the branch is
// marked unreachable, letting the optimizer elide bounds/overflow checks in
// downstream index arithmetic.
//
// SAFETY: this is sound only while the bounds cover every value the
// hardware can actually produce; a register value outside the range would
// make the `unreachable_unchecked` hint undefined behavior.
macro_rules! inbounds {
// One-argument form: value is in `0..=$bound`.
($func_name:ident, $bound:expr) => {{
let val = unsafe { $func_name() };
if val > $bound {
unsafe { core::hint::unreachable_unchecked() }
} else {
val
}
}};
// Two-argument form: value is in `$lower_bound..=$upper_bound`.
($func_name:ident, $lower_bound:expr, $upper_bound:expr) => {{
let val = unsafe { $func_name() };
if val < $lower_bound || val > $upper_bound {
unsafe { core::hint::unreachable_unchecked() }
} else {
val
}
}};
}
/// Zero-based index of the calling thread within its block along the `x` axis.
///
/// Hinted to be at most 1023: `blockDim.x` is capped at 1024 on all current
/// compute capabilities, so the zero-based index cannot exceed 1023.
#[gpu_only]
#[inline(always)]
pub fn thread_idx_x() -> u32 {
    // Was `1024` — off by one; `inbounds!` treats the bound as inclusive.
    inbounds!(__nvvm_thread_idx_x, 1023)
}
/// Zero-based index of the calling thread within its block along the `y` axis.
///
/// Hinted to be at most 1023: `blockDim.y` is capped at 1024, so the
/// zero-based index cannot exceed 1023.
#[gpu_only]
#[inline(always)]
pub fn thread_idx_y() -> u32 {
    // Was `1024` — off by one; the bound is inclusive.
    inbounds!(__nvvm_thread_idx_y, 1023)
}
/// Zero-based index of the calling thread within its block along the `z` axis.
///
/// Hinted to be at most 63: `blockDim.z` is capped at 64, so the zero-based
/// index cannot exceed 63.
#[gpu_only]
#[inline(always)]
pub fn thread_idx_z() -> u32 {
    // Was `64` — off by one; the bound is inclusive.
    inbounds!(__nvvm_thread_idx_z, 63)
}
/// Zero-based index of the calling block within the grid along the `x` axis.
///
/// Hinted to be at most 2147483646: `gridDim.x` is capped at 2^31 - 1, so
/// the zero-based index cannot exceed 2^31 - 2.
#[gpu_only]
#[inline(always)]
pub fn block_idx_x() -> u32 {
    // Was `2147483647` (the max *dimension*); the max index is one less.
    inbounds!(__nvvm_block_idx_x, 2147483646)
}
/// Zero-based index of the calling block within the grid along the `y` axis.
///
/// Hinted to be at most 65534: `gridDim.y` is capped at 65535, so the
/// zero-based index cannot exceed 65534.
#[gpu_only]
#[inline(always)]
pub fn block_idx_y() -> u32 {
    // Was `65535` (the max *dimension*); the max index is one less.
    inbounds!(__nvvm_block_idx_y, 65534)
}
/// Zero-based index of the calling block within the grid along the `z` axis.
///
/// Hinted to be at most 65534: `gridDim.z` is capped at 65535, so the
/// zero-based index cannot exceed 65534.
#[gpu_only]
#[inline(always)]
pub fn block_idx_z() -> u32 {
    // Was `65535` (the max *dimension*); the max index is one less.
    inbounds!(__nvvm_block_idx_z, 65534)
}
/// Number of threads per block along the `x` axis.
///
/// Hinted to lie in `1..=1024`, the documented limit for `blockDim.x`.
#[gpu_only]
#[inline(always)]
pub fn block_dim_x() -> u32 {
    // Was `1, 1025` — the upper bound is inclusive, and the hardware
    // maximum is 1024.
    inbounds!(__nvvm_block_dim_x, 1, 1024)
}
/// Number of threads per block along the `y` axis.
///
/// Hinted to lie in `1..=1024`, the documented limit for `blockDim.y`.
#[gpu_only]
#[inline(always)]
pub fn block_dim_y() -> u32 {
    // Was `1, 1025` — the upper bound is inclusive; the maximum is 1024.
    inbounds!(__nvvm_block_dim_y, 1, 1024)
}
/// Number of threads per block along the `z` axis.
///
/// Hinted to lie in `1..=64`, the documented limit for `blockDim.z`.
#[gpu_only]
#[inline(always)]
pub fn block_dim_z() -> u32 {
    // Was `1, 65` — the upper bound is inclusive; the maximum is 64.
    inbounds!(__nvvm_block_dim_z, 1, 64)
}
/// Number of blocks per grid along the `x` axis.
///
/// Hinted to lie in `1..=2147483647` (2^31 - 1), the documented limit for
/// `gridDim.x`.
#[gpu_only]
#[inline(always)]
pub fn grid_dim_x() -> u32 {
    // Was `1, 2147483648` — the upper bound is inclusive; the maximum is
    // 2^31 - 1.
    inbounds!(__nvvm_grid_dim_x, 1, 2147483647)
}
/// Number of blocks per grid along the `y` axis.
///
/// Hinted to lie in `1..=65535`, the documented limit for `gridDim.y`.
#[gpu_only]
#[inline(always)]
pub fn grid_dim_y() -> u32 {
    // Was `1, 65536` — the upper bound is inclusive; the maximum is 65535.
    inbounds!(__nvvm_grid_dim_y, 1, 65535)
}
/// Number of blocks per grid along the `z` axis.
///
/// Hinted to lie in `1..=65535`, the documented limit for `gridDim.z`.
#[gpu_only]
#[inline(always)]
pub fn grid_dim_z() -> u32 {
    // Was `1, 65536` — the upper bound is inclusive; the maximum is 65535.
    inbounds!(__nvvm_grid_dim_z, 1, 65535)
}
#[gpu_only]
#[inline(always)]
pub fn thread_idx() -> Vec3<u32> {
unsafe {
Vec3::new(
__nvvm_thread_idx_x(),
__nvvm_thread_idx_y(),
__nvvm_thread_idx_z(),
)
}
}
#[gpu_only]
#[inline(always)]
pub fn block_idx() -> Vec3<u32> {
unsafe {
Vec3::new(
__nvvm_block_idx_x(),
__nvvm_block_idx_y(),
__nvvm_block_idx_z(),
)
}
}
#[gpu_only]
#[inline(always)]
pub fn block_dim() -> Vec3<u32> {
unsafe {
Vec3::new(
__nvvm_block_dim_x(),
__nvvm_block_dim_y(),
__nvvm_block_dim_z(),
)
}
}
#[gpu_only]
#[inline(always)]
pub fn grid_dim() -> Vec3<u32> {
unsafe {
Vec3::new(
__nvvm_grid_dim_x(),
__nvvm_grid_dim_y(),
__nvvm_grid_dim_z(),
)
}
}
/// Flattened, globally unique 1D index of the calling thread across the
/// entire grid: the linearized block id times the threads per block, plus
/// the linearized thread offset inside the block.
#[gpu_only]
#[rustfmt::skip]
#[inline(always)]
pub fn index() -> u32 {
    let grid = grid_dim();
    let block = block_dim();
    let bid = block_idx();
    let tid = thread_idx();
    // Linearize the block coordinate (x fastest, then y, then z).
    let linear_block = bid.x + bid.y * grid.x + bid.z * grid.x * grid.y;
    // Then linearize the thread coordinate within the block.
    linear_block * block.product()
        + tid.z * block.x * block.y
        + tid.y * block.x
        + tid.x
}
/// Global 1D thread index along `x`:
/// `threadIdx.x + blockIdx.x * blockDim.x`.
#[inline(always)]
pub fn index_1d() -> u32 {
    // The accessors already return `u32`; the previous `as u32` casts
    // were redundant no-ops.
    thread_idx_x() + block_idx_x() * block_dim_x()
}
/// Global 2D thread index, one global coordinate per axis:
/// `threadIdx.{x,y} + blockIdx.{x,y} * blockDim.{x,y}`.
#[inline(always)]
pub fn index_2d() -> Vec2<u32> {
    Vec2::new(
        thread_idx_x() + block_idx_x() * block_dim_x(),
        thread_idx_y() + block_idx_y() * block_dim_y(),
    )
}
/// Global 3D thread index, one global coordinate per axis:
/// `threadIdx.{x,y,z} + blockIdx.{x,y,z} * blockDim.{x,y,z}`.
#[inline(always)]
pub fn index_3d() -> Vec3<u32> {
    Vec3::new(
        thread_idx_x() + block_idx_x() * block_dim_x(),
        thread_idx_y() + block_idx_y() * block_dim_y(),
        thread_idx_z() + block_idx_z() * block_dim_z(),
    )
}
/// Returns `true` only for the very first thread of the very first block
/// (all thread and block coordinates zero). Handy for one-time work such
/// as grid-wide initialization or debug printing.
#[inline(always)]
pub fn first() -> bool {
    let origin = Vec3::zero();
    thread_idx() == origin && block_idx() == origin
}
#[gpu_only]
#[inline(always)]
/// Number of threads in a warp, read from the warp-size special register.
/// (32 on current NVIDIA hardware, but read it rather than hard-coding.)
pub fn warp_size() -> u32 {
unsafe { __nvvm_warp_size() }
}
#[gpu_only]
#[inline(always)]
/// Block-wide execution barrier: waits until every thread in the block has
/// reached this point. Equivalent in intent to CUDA's `__syncthreads`.
pub fn sync_threads() {
unsafe { __nvvm_block_barrier() }
}
/// Synchronizes all threads in the block like [`sync_threads`], then
/// returns the number of threads whose `predicate` was nonzero
/// (`llvm.nvvm.barrier0.popc`).
#[gpu_only]
#[inline(always)]
pub fn sync_threads_count(predicate: u32) -> u32 {
    extern "C" {
        // Linkage is fixed by `link_name`; the local identifier is free.
        #[link_name = "llvm.nvvm.barrier0.popc"]
        fn barrier0_popc(predicate: u32) -> u32;
    }
    unsafe { barrier0_popc(predicate) }
}
/// Synchronizes all threads in the block like [`sync_threads`], then
/// returns nonzero iff `predicate` was nonzero for *all* of them
/// (`llvm.nvvm.barrier0.and`).
#[gpu_only]
#[inline(always)]
pub fn sync_threads_and(predicate: u32) -> u32 {
    extern "C" {
        // Linkage is fixed by `link_name`; the local identifier is free.
        #[link_name = "llvm.nvvm.barrier0.and"]
        fn barrier0_and(predicate: u32) -> u32;
    }
    unsafe { barrier0_and(predicate) }
}
/// Synchronizes all threads in the block like [`sync_threads`], then
/// returns nonzero iff `predicate` was nonzero for *any* of them
/// (`llvm.nvvm.barrier0.or`).
#[gpu_only]
#[inline(always)]
pub fn sync_threads_or(predicate: u32) -> u32 {
    extern "C" {
        // Linkage is fixed by `link_name`; the local identifier is free.
        #[link_name = "llvm.nvvm.barrier0.or"]
        fn barrier0_or(predicate: u32) -> u32;
    }
    unsafe { barrier0_or(predicate) }
}
#[gpu_only]
#[inline(always)]
/// Memory fence ordering this thread's memory accesses at grid scope.
/// Note this is a fence, not a barrier: it does not wait for other threads.
pub fn grid_fence() {
unsafe { __nvvm_grid_fence() }
}
#[gpu_only]
#[inline(always)]
/// Memory fence ordering this thread's memory accesses at device scope.
/// Note this is a fence, not a barrier: it does not wait for other threads.
pub fn device_fence() {
unsafe { __nvvm_device_fence() }
}
#[gpu_only]
#[inline(always)]
/// Memory fence ordering this thread's memory accesses at system scope
/// (the widest of the three fences here, per the intrinsic names).
/// Note this is a fence, not a barrier: it does not wait for other threads.
pub fn system_fence() {
unsafe { __nvvm_system_fence() }
}
/// Suspends the calling thread for approximately `nanos` nanoseconds.
/// The sleep duration is a best-effort hint, not a precise guarantee, and
/// the instruction requires sm_70 or newer per the PTX ISA.
#[gpu_only]
#[inline(always)]
pub fn nanosleep(nanos: u32) {
    unsafe {
        asm!(
            // Fixed: the PTX instruction requires the `.u32` type suffix;
            // bare `nanosleep` is not a valid mnemonic and failed ptxas.
            "nanosleep.u32 {}",
            in(reg32) nanos
        )
    }
}