use std::{fmt, sync::Arc};
use cudarc::driver::{result, sys::CUdevice_attribute, CudaContext};
use crate::backend::staging_reserve::reserved_vec;
fn format_cuda_context_init_error(ordinal: usize, error: impl fmt::Display) -> String {
format!(
"CUDA context init failed for ordinal {ordinal}: {error}. Fix: choose a visible `nvidia-smi -L` ordinal and ensure no exclusive-process compute mode blocks context creation. If the error is CUDA_ERROR_OUT_OF_MEMORY, treat it as live VRAM pressure during context creation: run `nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv`, free or move the processes holding VRAM, then rerun the CUDA-required validation; do not skip GPU tests or continue on a CPU path."
)
}
#[cfg(test)]
mod context_init_error_tests {
use super::format_cuda_context_init_error;
#[test]
fn context_init_oom_diagnostic_names_vram_pressure_without_cpu_escape() {
let diagnostic = format_cuda_context_init_error(0, "CUDA_ERROR_OUT_OF_MEMORY");
assert!(diagnostic.contains("CUDA_ERROR_OUT_OF_MEMORY"));
assert!(diagnostic.contains("live VRAM pressure during context creation"));
assert!(diagnostic
.contains("nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv"));
assert!(diagnostic.contains("do not skip GPU tests"));
assert!(diagnostic.contains("continue on a CPU path"));
}
}
#[derive(Debug, Clone)]
pub struct CudaDeviceCaps {
pub name: String,
pub ordinal: usize,
pub compute_capability: (u32, u32),
pub total_memory: u64,
pub max_threads_per_block: i32,
pub max_block_dim: [i32; 3],
pub max_grid_dim: [i32; 3],
pub shared_memory_per_block: i32,
pub shared_memory_per_sm: i32,
pub warp_size: i32,
pub cooperative_launch: bool,
pub concurrent_kernels: bool,
pub async_engine_count: i32,
pub multi_processor_count: i32,
pub l2_cache_bytes: i32,
pub memory_clock_rate_khz: i32,
pub global_memory_bus_width_bits: i32,
pub max_registers_per_block: i32,
pub max_registers_per_sm: i32,
pub max_threads_per_sm: i32,
}
#[derive(Debug, Clone)]
pub struct CudaDeviceHandle {
pub caps: CudaDeviceCaps,
pub ctx: Arc<CudaContext>,
}
impl CudaDeviceHandle {
pub fn acquire_ordinal(ordinal: usize) -> Result<Self, String> {
let device_count = CudaDeviceCaps::visible_device_count()?;
if ordinal >= device_count {
return Err(format!(
"CUDA device ordinal {ordinal} is out of range for {device_count} visible device(s). Fix: select a CUDA device ordinal reported by `nvidia-smi`."
));
}
let ctx = CudaContext::new(ordinal)
.map_err(|error| format_cuda_context_init_error(ordinal, error))?;
ctx.bind_to_thread().map_err(|e| {
format!(
"CUDA context bind failed for ordinal {ordinal}: {e}. Fix: repair CUDA context ownership before dispatch; GPU-required runs must not continue with an unbound context."
)
})?;
let caps = CudaDeviceCaps::probe_context(ordinal, &ctx)?;
Ok(Self { caps, ctx })
}
}
impl CudaDeviceCaps {
fn required_u32_capability(&self, name: &str, value: i32) -> u32 {
debug_assert!(
value > 0,
"CUDA device `{}` carried invalid {name}={value} after capability validation",
self.name
);
if value <= 0 {
tracing::error!(
"CUDA device `{}` carried invalid {name}={value} after capability validation. Fix: reject corrupt capability snapshots during probe.",
self.name
);
return 0;
}
u32::try_from(value).unwrap_or_else(|source| {
tracing::error!(
"CUDA device `{}` carried non-u32 {name}={value} after capability validation: {source}. Fix: reject corrupt capability snapshots during probe.",
self.name
);
0
})
}
pub fn visible_device_count() -> Result<usize, String> {
result::init().map_err(|e| {
format!(
"CUDA driver init failed: {e}. Fix: verify `nvidia-smi` succeeds and libcuda.so from the NVIDIA driver is visible to this process."
)
})?;
let count = result::device::get_count()
.map_err(|e| {
format!(
"CUDA device-count query failed: {e}. Fix: repair CUDA driver/device visibility; a GPU-required host must not report zero devices."
)
})?;
usize::try_from(count)
.map_err(|_| format!("CUDA device-count query returned negative value {count}"))
}
pub fn probe_all() -> Result<Vec<Self>, String> {
let device_count = Self::visible_device_count()?;
if device_count == 0 {
return Err(
"CUDA device-count query returned zero visible devices. Fix: this is a GPU-required release host; run `nvidia-smi -L`, repair CUDA_VISIBLE_DEVICES/container GPU passthrough, and do not silently continue on a CPU path."
.to_string(),
);
}
let mut devices = reserved_vec(device_count, "cuda visible device probes")
.map_err(|error| error.to_string())?;
for ordinal in 0..device_count {
devices.push(Self::probe(ordinal)?);
}
Ok(devices)
}
pub fn probe(ordinal: usize) -> Result<Self, String> {
let device_count = Self::visible_device_count()?;
if ordinal >= device_count {
return Err(format!(
"CUDA device ordinal {ordinal} is out of range for {device_count} visible device(s). Fix: select a CUDA device ordinal reported by `nvidia-smi`."
));
}
let ctx = CudaContext::new(ordinal)
.map_err(|error| format_cuda_context_init_error(ordinal, error))?;
Self::probe_context(ordinal, &ctx)
}
fn probe_context(ordinal: usize, ctx: &CudaContext) -> Result<Self, String> {
let dev = ctx.cu_device();
let attr = |name: &str, attrib| {
unsafe { result::device::get_attribute(dev, attrib) }
.map_err(|e| format!("CUDA attribute query `{name}` failed: {e}"))
};
let total_memory = unsafe { result::device::total_mem(dev) }
.map_err(|e| format!("CUDA total-memory query failed: {e}"))?;
let name = result::device::get_name(dev)
.map_err(|e| format!("CUDA device-name query failed: {e}"))?;
let major = attr(
"compute_capability_major",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
)?;
let minor = attr(
"compute_capability_minor",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
)?;
let max_threads_per_block = attr(
"max_threads_per_block",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
)?;
let max_block_dim_x = attr(
"max_block_dim_x",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
)?;
let max_block_dim_y = attr(
"max_block_dim_y",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
)?;
let max_block_dim_z = attr(
"max_block_dim_z",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
)?;
let max_grid_dim_x = attr(
"max_grid_dim_x",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
)?;
let max_grid_dim_y = attr(
"max_grid_dim_y",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
)?;
let max_grid_dim_z = attr(
"max_grid_dim_z",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
)?;
let shared_memory_per_block = attr(
"shared_memory_per_block",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
)?;
let shared_memory_per_sm = attr(
"shared_memory_per_sm",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
)?;
let warp_size = attr(
"warp_size",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE,
)?;
let cooperative_launch = attr(
"cooperative_launch",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH,
)?;
let concurrent_kernels = attr(
"concurrent_kernels",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS,
)?;
let async_engine_count = attr(
"async_engine_count",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT,
)?;
let multi_processor_count = attr(
"multi_processor_count",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
)?;
let l2_cache_bytes = attr(
"l2_cache_bytes",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
)?;
let memory_clock_rate_khz = attr(
"memory_clock_rate_khz",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
)?;
let global_memory_bus_width_bits = attr(
"global_memory_bus_width_bits",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
)?;
let max_registers_per_block = attr(
"max_registers_per_block",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
)?;
let max_registers_per_sm = attr(
"max_registers_per_sm",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
)?;
let max_threads_per_sm = attr(
"max_threads_per_sm",
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
)?;
let caps = Self {
name,
ordinal,
compute_capability: (
u32::try_from(major).map_err(|source| {
format!("CUDA device major compute capability was negative ({major}): {source}. Fix: repair CUDA driver attribute probing before dispatch.")
})?,
u32::try_from(minor).map_err(|source| {
format!("CUDA device minor compute capability was negative ({minor}): {source}. Fix: repair CUDA driver attribute probing before dispatch.")
})?,
),
total_memory: u64::try_from(total_memory).map_err(|source| {
format!("CUDA total memory value {total_memory} does not fit u64: {source}. Fix: widen CudaDeviceCaps memory telemetry before dispatch.")
})?,
max_threads_per_block,
max_block_dim: [max_block_dim_x, max_block_dim_y, max_block_dim_z],
max_grid_dim: [max_grid_dim_x, max_grid_dim_y, max_grid_dim_z],
shared_memory_per_block,
shared_memory_per_sm,
warp_size,
cooperative_launch: cooperative_launch != 0,
concurrent_kernels: concurrent_kernels != 0,
async_engine_count,
multi_processor_count,
l2_cache_bytes,
memory_clock_rate_khz,
global_memory_bus_width_bits,
max_registers_per_block,
max_registers_per_sm,
max_threads_per_sm,
};
caps.validate_required_attributes()?;
Ok(caps)
}
fn validate_required_attributes(&self) -> Result<(), String> {
if self.name.trim().is_empty() {
return Err(format!(
"CUDA device ordinal {} returned an empty device name. Fix: repair CUDA driver probing before capability-dependent dispatch.",
self.ordinal
));
}
if self.compute_capability.0 == 0 {
return Err(format!(
"CUDA device `{}` returned invalid compute capability {:?}. Fix: update the NVIDIA driver so CUDA attributes report a real SM target.",
self.name, self.compute_capability
));
}
if self.total_memory == 0 {
return Err(format!(
"CUDA device `{}` reported zero total memory. Fix: repair CUDA device visibility; do not continue with bogus memory limits.",
self.name
));
}
for (name, value) in [
("max_threads_per_block", self.max_threads_per_block),
("max_block_dim_x", self.max_block_dim[0]),
("max_block_dim_y", self.max_block_dim[1]),
("max_block_dim_z", self.max_block_dim[2]),
("max_grid_dim_x", self.max_grid_dim[0]),
("max_grid_dim_y", self.max_grid_dim[1]),
("max_grid_dim_z", self.max_grid_dim[2]),
("shared_memory_per_block", self.shared_memory_per_block),
("shared_memory_per_sm", self.shared_memory_per_sm),
("warp_size", self.warp_size),
("multi_processor_count", self.multi_processor_count),
("l2_cache_bytes", self.l2_cache_bytes),
("memory_clock_rate_khz", self.memory_clock_rate_khz),
(
"global_memory_bus_width_bits",
self.global_memory_bus_width_bits,
),
("max_registers_per_block", self.max_registers_per_block),
("max_registers_per_sm", self.max_registers_per_sm),
("max_threads_per_sm", self.max_threads_per_sm),
] {
if value <= 0 {
return Err(format!(
"CUDA device `{}` reported invalid {name}={value}. Fix: repair CUDA capability probing before dispatch; zero/negative limits are a hard GPU configuration error.",
self.name
));
}
}
Ok(())
}
#[must_use]
pub fn native_sm(&self) -> u32 {
self.compute_capability.0 * 10 + self.compute_capability.1
}
#[must_use]
pub fn ptx_target_sm(&self) -> u32 {
self.native_sm()
}
#[must_use]
pub fn shared_memory_per_block_bytes(&self) -> u32 {
self.required_u32_capability("shared_memory_per_block", self.shared_memory_per_block)
}
#[must_use]
pub fn shared_memory_per_sm_bytes(&self) -> u32 {
self.required_u32_capability("shared_memory_per_sm", self.shared_memory_per_sm)
}
#[must_use]
pub fn max_threads_per_block_u32(&self) -> u32 {
self.required_u32_capability("max_threads_per_block", self.max_threads_per_block)
}
#[must_use]
pub fn max_registers_per_block_u32(&self) -> u32 {
self.required_u32_capability("max_registers_per_block", self.max_registers_per_block)
}
#[must_use]
pub fn max_registers_per_sm_u32(&self) -> u32 {
self.required_u32_capability("max_registers_per_sm", self.max_registers_per_sm)
}
#[must_use]
pub fn max_threads_per_sm_u32(&self) -> u32 {
self.required_u32_capability("max_threads_per_sm", self.max_threads_per_sm)
}
#[must_use]
pub fn multi_processor_count_u32(&self) -> u32 {
self.required_u32_capability("multi_processor_count", self.multi_processor_count)
}
#[must_use]
pub fn l2_cache_bytes_u32(&self) -> u32 {
self.required_u32_capability("l2_cache_bytes", self.l2_cache_bytes)
}
#[must_use]
pub fn memory_bandwidth_gbps(&self) -> u32 {
let clock_khz =
self.required_u32_capability("memory_clock_rate_khz", self.memory_clock_rate_khz);
let bus_bits = self.required_u32_capability(
"global_memory_bus_width_bits",
self.global_memory_bus_width_bits,
);
let gbps = (u64::from(clock_khz) * u64::from(bus_bits)) / 4_000_000;
u32::try_from(gbps.max(1)).unwrap_or_else(|source| {
tracing::error!(
"CUDA device `{}` memory bandwidth {gbps} GB/s does not fit u32: {source}. Fix: normalize the bandwidth model before exporting device profile telemetry.",
self.name
);
u32::MAX
})
}
#[must_use]
pub fn max_registers_per_thread_u32(&self) -> u32 {
self.max_registers_per_block_u32().min(255)
}
#[must_use]
pub fn max_block_dim_u32(&self) -> [u32; 3] {
self.max_block_dim
.map(|value| self.required_u32_capability("max_block_dim axis", value))
}
#[must_use]
pub fn max_grid_dim_u32(&self) -> [u32; 3] {
self.max_grid_dim
.map(|value| self.required_u32_capability("max_grid_dim axis", value))
}
#[must_use]
pub fn warp_size_u32(&self) -> Option<u32> {
Some(self.required_warp_size_u32())
}
#[must_use]
pub fn required_warp_size_u32(&self) -> u32 {
self.required_u32_capability("warp_size", self.warp_size)
}
#[must_use]
pub fn hardware_supports_f16(&self) -> bool {
self.compute_capability >= (5, 3)
}
#[must_use]
pub fn hardware_supports_bf16(&self) -> bool {
self.compute_capability >= (8, 0)
}
#[must_use]
pub fn hardware_supports_tensor_cores(&self) -> bool {
self.compute_capability >= (7, 0)
}
#[must_use]
pub fn to_adapter_caps(&self) -> vyre_foundation::optimizer::AdapterCaps {
self.to_device_profile().into()
}
#[must_use]
pub fn to_device_profile(&self) -> vyre_driver::DeviceProfile {
let subgroup = self.subgroup_caps();
let profile = vyre_driver::DeviceProfile {
backend: "cuda",
supports_subgroup_ops: subgroup.supports_subgroup,
supports_indirect_dispatch: self.cooperative_launch,
supports_distributed_collectives: false,
supports_specialization_constants: false,
supports_f16: self.hardware_supports_f16(),
supports_bf16: self.hardware_supports_bf16(),
supports_trap_propagation: true,
supports_tensor_cores: self.hardware_supports_tensor_cores(),
has_mul_high: true,
has_dual_issue_fp32_int32: true,
has_subgroup_shuffle: subgroup.supports_subgroup,
has_shared_memory: self.shared_memory_per_block_bytes() > 0,
max_native_int_width: 64,
max_workgroup_size: self.max_block_dim_u32(),
max_invocations_per_workgroup: self.max_threads_per_block_u32(),
max_shared_memory_bytes: self.shared_memory_per_block_bytes(),
max_storage_buffer_binding_size: self.total_memory,
subgroup_size: subgroup.subgroup_size,
compute_units: self.multi_processor_count_u32(),
regs_per_thread_max: self.max_registers_per_thread_u32(),
l1_cache_bytes: 0,
l2_cache_bytes: self.l2_cache_bytes_u32(),
mem_bw_gbps: self.memory_bandwidth_gbps(),
ideal_unroll_depth: 0,
ideal_vector_pack_bits: 0,
ideal_workgroup_tile: [0, 0, 0],
shared_memory_bank_count: 32,
shared_memory_bank_width_bytes: 4,
};
vyre_driver::DeviceSignatureTable::builtins().map_or(profile, |table| {
table.apply_generation_to_profile(self.native_sm(), profile)
})
}
#[must_use]
pub fn subgroup_caps(&self) -> vyre_driver::SubgroupCaps {
vyre_driver::SubgroupCaps::native(self.required_warp_size_u32())
}
}
#[cfg(test)]
mod tests {
use crate::synthetic_device_caps::blackwell_sm120_caps_default;
#[test]
fn cuda_profile_applies_builtin_sm_signature() {
let profile = blackwell_sm120_caps_default().to_device_profile();
let table =
vyre_driver::DeviceSignatureTable::builtins().expect("Fix: builtin signatures load");
let signature = table
.find_architecture_generation(120)
.expect("Fix: SM_120 must match the builtin Blackwell signature");
assert_eq!(profile.compute_units, 170);
assert_eq!(profile.ideal_unroll_depth, signature.ideal_unroll_depth);
assert_eq!(
profile.ideal_vector_pack_bits,
signature.ideal_vector_pack_bits
);
assert_eq!(profile.ideal_workgroup_tile, signature.ideal_workgroup_tile);
assert_eq!(profile.shared_memory_bank_count, signature.bank_count);
}
#[test]
fn cuda_profile_preserves_probed_compute_units_without_builtin_signature() {
let mut caps = blackwell_sm120_caps_default();
caps.compute_capability = (99, 0);
caps.multi_processor_count = 13;
let profile = caps.to_device_profile();
assert_eq!(profile.compute_units, 13);
assert_eq!(profile.regs_per_thread_max, 255);
assert_eq!(profile.l2_cache_bytes, 96 * 1024 * 1024);
assert_eq!(profile.mem_bw_gbps, 1792);
assert_eq!(profile.max_invocations_per_workgroup, 1024);
assert_eq!(profile.max_shared_memory_bytes, 128 * 1024);
assert_eq!(caps.shared_memory_per_sm_bytes(), 256 * 1024);
assert_eq!(profile.shared_memory_bank_count, 32);
assert_eq!(profile.shared_memory_bank_width_bytes, 4);
}
#[test]
fn cuda_probe_all_rejects_zero_device_silent_fallback_by_contract() {
let source = include_str!("device.rs");
assert!(
source.contains("device_count == 0")
&& source.contains("do not silently continue on a CPU path"),
"Fix: CUDA device discovery must fail loudly when a GPU-required host reports zero visible devices."
);
assert!(
!source.contains(concat!("(0..device_count)", ".map(Self::probe).collect()")),
"Fix: CUDA device discovery must not hide zero devices behind an empty successful probe list."
);
}
#[test]
fn cuda_capability_conversion_has_no_production_panic_path() {
let source = include_str!("device.rs");
let start = source
.find("fn required_u32_capability")
.expect("Fix: CUDA capability conversion helper must exist");
let end = source[start..]
.find("/// Return the number of CUDA devices visible")
.expect("Fix: CUDA capability conversion helper must stay before device discovery")
+ start;
let helper = &source[start..end];
assert!(
!helper.contains("panic!("),
"Fix: CUDA capability accessors must not abort production dispatch; probe-time validation must reject invalid capability snapshots with typed errors."
);
assert!(
!helper.contains("unwrap_or(1)"),
"Fix: CUDA capability accessors must not manufacture fake nonzero defaults after validation."
);
assert!(
!helper.contains(" as u32"),
"Fix: CUDA capability accessors must use checked integer conversion, not release-path narrowing casts."
);
let bandwidth_start = source
.find("pub fn memory_bandwidth_gbps")
.expect("Fix: CUDA memory bandwidth helper must exist");
let bandwidth_end = source[bandwidth_start..]
.find("/// NVIDIA CUDA architectural register ceiling")
.expect("Fix: CUDA memory bandwidth helper should precede register helper")
+ bandwidth_start;
let bandwidth_helper = &source[bandwidth_start..bandwidth_end];
assert!(
!bandwidth_helper.contains(" as u32"),
"Fix: CUDA bandwidth telemetry must not narrow with an unchecked cast."
);
assert!(
bandwidth_helper.contains("u32::try_from"),
"Fix: CUDA bandwidth telemetry must use checked conversion after widened arithmetic."
);
assert!(
source.contains("caps.validate_required_attributes()?"),
"Fix: CUDA capability probing must validate launch-critical values before exposing infallible accessors."
);
assert!(
source.contains("CUDA device major compute capability was negative")
&& source.contains("map_err(|source|"),
"Fix: CUDA capability probe must return typed errors for corrupt driver attributes instead of panicking."
);
}
}