Skip to main content

baracuda_runtime/
query.rs

1//! Runtime-API queries: pointer attributes, device properties, kernel
2//! attributes. Typed wrappers around the `cuda*GetAttributes` /
3//! `cudaGetDeviceProperties` family.
4
5use baracuda_cuda_sys::runtime::runtime;
6use baracuda_cuda_sys::runtime::types::{
7    cudaFuncAttributes, cudaMemoryType, cudaPointerAttributes,
8};
9
10use crate::device::Device;
11use crate::error::{check, Result};
12
/// Classification of a pointer, as produced by [`pointer_attributes`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MemoryType {
    /// CUDA has no record of the pointer — e.g. ordinary host `malloc`
    /// memory or some other OS allocation unrelated to CUDA.
    Unregistered,
    /// Host-side memory (pinned / mapped / managed-with-host-affinity).
    Host,
    /// Ordinary device memory.
    Device,
    /// Managed (unified) memory.
    Managed,
}
26
27impl MemoryType {
28    #[inline]
29    fn from_raw(raw: i32) -> Self {
30        match raw {
31            cudaMemoryType::HOST => MemoryType::Host,
32            cudaMemoryType::DEVICE => MemoryType::Device,
33            cudaMemoryType::MANAGED => MemoryType::Managed,
34            _ => MemoryType::Unregistered,
35        }
36    }
37}
38
39/// Typed view over `cudaPointerAttributes`.
40#[derive(Copy, Clone, Debug)]
41pub struct PointerAttributes {
42    pub memory_type: MemoryType,
43    pub device: i32,
44    pub device_pointer: *mut core::ffi::c_void,
45    pub host_pointer: *mut core::ffi::c_void,
46}
47
48/// Query what CUDA knows about `ptr`. On pointers CUDA has never seen
49/// (plain host `malloc`, returned from C libraries) this returns
50/// `MemoryType::Unregistered`.
51///
52/// # Safety
53///
54/// `ptr` can be any pointer — CUDA internally classifies it. This is a
55/// pure query and doesn't dereference `ptr`.
56#[allow(clippy::not_unsafe_ptr_arg_deref)]
57pub fn pointer_attributes(ptr: *const core::ffi::c_void) -> Result<PointerAttributes> {
58    let r = runtime()?;
59    let cu = r.cuda_pointer_get_attributes()?;
60    let mut raw = cudaPointerAttributes::default();
61    check(unsafe {
62        cu(
63            &mut raw as *mut cudaPointerAttributes as *mut core::ffi::c_void,
64            ptr,
65        )
66    })?;
67    Ok(PointerAttributes {
68        memory_type: MemoryType::from_raw(raw.type_),
69        device: raw.device,
70        device_pointer: raw.device_pointer,
71        host_pointer: raw.host_pointer,
72    })
73}
74
/// Subset of `cudaDeviceProp` fields most users care about.
///
/// The full C struct is ~1 KB with fields that are rarely accessed, so
/// only the hot-path ones are surfaced here. The raw `cudaDeviceProp`
/// buffer is *not* retained: this struct holds plain owned values only,
/// which is also why it can be cloned and compared freely.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct DeviceProperties {
    /// Device name string.
    pub name: String,
    /// Total global memory, in bytes.
    pub total_global_memory_bytes: u64,
    /// Maximum shared memory available per block, in bytes.
    pub shared_memory_per_block_bytes: u64,
    /// Maximum number of registers available per block.
    pub regs_per_block: i32,
    /// Warp size, in threads.
    pub warp_size: i32,
    /// Maximum number of threads per block.
    pub max_threads_per_block: i32,
    /// Maximum block dimensions, ordered `[x, y, z]`.
    pub max_block_dim: [i32; 3],
    /// Maximum grid dimensions, ordered `[x, y, z]`.
    pub max_grid_dim: [i32; 3],
    /// Core clock rate, in kHz.
    pub clock_rate_khz: i32,
    /// Memory clock rate, in kHz.
    pub memory_clock_rate_khz: i32,
    /// Global memory bus width, in bits.
    ///
    /// NOTE(review): `device_properties` in this file currently fills this
    /// with a literal `0` rather than querying the device.
    pub memory_bus_width_bits: i32,
    /// L2 cache size, in bytes.
    ///
    /// NOTE(review): `device_properties` in this file currently fills this
    /// with a literal `0` rather than querying the device.
    pub l2_cache_size_bytes: i32,
    /// Maximum resident threads per multiprocessor.
    ///
    /// NOTE(review): `device_properties` in this file currently fills this
    /// with a literal `0` rather than querying the device.
    pub max_threads_per_sm: i32,
    /// Number of multiprocessors on the device.
    pub multiprocessor_count: i32,
    /// Major component of the compute capability.
    pub compute_capability_major: i32,
    /// Minor component of the compute capability.
    pub compute_capability_minor: i32,
    /// `true` when the device reports itself as integrated.
    pub integrated: bool,
    /// `true` when the device reports support for concurrent kernels.
    pub concurrent_kernels: bool,
    /// PCI bus id of the device.
    pub pci_bus_id: i32,
    /// PCI device id of the device.
    pub pci_device_id: i32,
    /// PCI domain id of the device.
    pub pci_domain_id: i32,
}
103
104/// Fetch a typed subset of `cudaDeviceProp` for `device`.
105///
106/// We use `cudaGetDeviceProperties` to pull the device name (a 256-byte
107/// char array at offset 0 — the one layout-stable field across CUDA
108/// versions) and per-attribute queries for everything else. This is
109/// more robust than struct-offset parsing because `cudaDeviceProp` has
110/// grown over CUDA versions and offsets drift silently.
111pub fn device_properties(device: &Device) -> Result<DeviceProperties> {
112    use baracuda_cuda_sys::runtime::types::cudaDeviceAttr as Attr;
113
114    let r = runtime()?;
115    let cu = r.cuda_get_device_properties()?;
116    let mut buf = vec![0u8; 2048];
117    check(unsafe { cu(buf.as_mut_ptr() as *mut core::ffi::c_void, device.ordinal()) })?;
118
119    // The name field is a 256-byte char array at offset 0 — the only
120    // field we read from the buffer. Everything else goes through
121    // cudaDeviceGetAttribute which is version-stable.
122    let name = unsafe {
123        let name_ptr = buf.as_ptr() as *const core::ffi::c_char;
124        core::ffi::CStr::from_ptr(name_ptr)
125            .to_string_lossy()
126            .into_owned()
127    };
128
129    // Total global memory isn't a per-attribute query — use cudaMemGetInfo.
130    let total_global_memory_bytes = {
131        let cu_info = r.cuda_mem_get_info()?;
132        let mut free: usize = 0;
133        let mut total: usize = 0;
134        check(unsafe { cu_info(&mut free, &mut total) })?;
135        total as u64
136    };
137
138    Ok(DeviceProperties {
139        name,
140        total_global_memory_bytes,
141        shared_memory_per_block_bytes: device
142            .attribute(Attr::MAX_SHARED_MEMORY_PER_BLOCK)
143            .unwrap_or(0) as u64,
144        regs_per_block: device.attribute(Attr::MAX_REGISTERS_PER_BLOCK).unwrap_or(0),
145        warp_size: device.attribute(Attr::WARP_SIZE).unwrap_or(0),
146        max_threads_per_block: device.attribute(Attr::MAX_THREADS_PER_BLOCK).unwrap_or(0),
147        max_block_dim: [
148            device.attribute(Attr::MAX_BLOCK_DIM_X).unwrap_or(0),
149            device.attribute(Attr::MAX_BLOCK_DIM_Y).unwrap_or(0),
150            device.attribute(Attr::MAX_BLOCK_DIM_Z).unwrap_or(0),
151        ],
152        max_grid_dim: [
153            device.attribute(Attr::MAX_GRID_DIM_X).unwrap_or(0),
154            device.attribute(Attr::MAX_GRID_DIM_Y).unwrap_or(0),
155            device.attribute(Attr::MAX_GRID_DIM_Z).unwrap_or(0),
156        ],
157        clock_rate_khz: device.attribute(Attr::CLOCK_RATE).unwrap_or(0),
158        memory_clock_rate_khz: device.attribute(Attr::CLOCK_RATE).unwrap_or(0),
159        memory_bus_width_bits: 0,
160        l2_cache_size_bytes: 0,
161        max_threads_per_sm: 0,
162        multiprocessor_count: device.attribute(Attr::MULTIPROCESSOR_COUNT).unwrap_or(0),
163        compute_capability_major: device
164            .attribute(Attr::COMPUTE_CAPABILITY_MAJOR)
165            .unwrap_or(0),
166        compute_capability_minor: device
167            .attribute(Attr::COMPUTE_CAPABILITY_MINOR)
168            .unwrap_or(0),
169        integrated: device.attribute(Attr::INTEGRATED).unwrap_or(0) != 0,
170        concurrent_kernels: device.attribute(Attr::CONCURRENT_KERNELS).unwrap_or(0) != 0,
171        pci_bus_id: device.attribute(Attr::PCI_BUS_ID).unwrap_or(0),
172        pci_device_id: device.attribute(Attr::PCI_DEVICE_ID).unwrap_or(0),
173        pci_domain_id: device.attribute(Attr::PCI_DOMAIN_ID).unwrap_or(0),
174    })
175}
176
177/// Query a kernel's register / shared-memory / PTX-version metadata.
178/// `func_symbol` is a `const void*` — the address of the kernel symbol
179/// as used by `cudaLaunchKernel` (or [`crate::Kernel::as_launch_ptr`]).
180///
181/// # Safety
182///
183/// `func_symbol` must be a valid CUDA kernel symbol address. Passing
184/// garbage causes undefined behavior inside the driver.
185pub unsafe fn func_attributes(func_symbol: *const core::ffi::c_void) -> Result<cudaFuncAttributes> { unsafe {
186    let r = runtime()?;
187    let cu = r.cuda_func_get_attributes()?;
188    let mut attrs = cudaFuncAttributes::default();
189    check(cu(
190        &mut attrs as *mut cudaFuncAttributes as *mut core::ffi::c_void,
191        func_symbol,
192    ))?;
193    Ok(attrs)
194}}