use crate::numeric::CUDA_NUMERIC;
/// Capabilities of a CUDA device, as consulted by the launch diagnostics in this file.
///
/// Each field is compared against its counterpart in [`CudaKernelRequirement`] when
/// launch failures are recorded.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaKernelDeviceEnvelope {
    /// Major component of the device SM version; compared before `sm_minor`.
    pub sm_major: u16,
    /// Minor component of the device SM version.
    pub sm_minor: u16,
    /// Largest thread count a single block may request on this device.
    pub max_threads_per_block: u32,
    /// Largest per-block shared-memory request, in bytes, that the device accepts.
    pub shared_memory_per_block_bytes: u64,
    /// Whether kernels that require a cooperative launch can run on this device.
    pub supports_cooperative_launch: bool,
    /// Whether kernels that require tensor cores can run on this device.
    pub supports_tensor_cores: bool,
}
/// What a kernel launch needs from a device before it can run.
///
/// Each field is checked against its counterpart in [`CudaKernelDeviceEnvelope`];
/// every unmet field produces one [`CudaKernelCapabilityFailure`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaKernelRequirement {
    /// Lowest acceptable SM major version; compared before `min_sm_minor`.
    pub min_sm_major: u16,
    /// Lowest acceptable SM minor version.
    pub min_sm_minor: u16,
    /// Threads per block the launch asks for.
    pub requested_threads_per_block: u32,
    /// Shared memory per block, in bytes, the launch asks for.
    pub requested_shared_memory_bytes: u64,
    /// Whether the launch needs cooperative-launch support.
    pub requires_cooperative_launch: bool,
    /// Whether the launch needs tensor-core support.
    pub requires_tensor_cores: bool,
}
/// Geometry and feature flags of one concrete kernel launch.
///
/// `diagnose_cuda_kernel_launch_shape` turns a shape into a [`CudaKernelRequirement`]
/// and checks it against a device.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaKernelLaunchShape {
    /// Grid dimensions; their product is the launch's block count.
    pub grid: [u32; 3],
    /// Block dimensions; their product is the launch's threads per block.
    pub block: [u32; 3],
    /// Dynamic shared memory requested per block, in bytes.
    pub dynamic_shared_memory_bytes: u32,
    /// Whether the launch is cooperative.
    pub cooperative: bool,
    /// Whether the kernel needs tensor cores.
    pub requires_tensor_cores: bool,
}
/// Full record of a launch-shape diagnosis: the inputs, the derived totals, and the verdict.
///
/// The field descriptions below say how `diagnose_cuda_kernel_launch_shape` populates
/// them; because every field is public, other constructors are free to differ.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CudaKernelLaunchEnvelope {
    /// Name of the kernel this envelope describes.
    pub kernel: &'static str,
    /// Device capabilities the launch was checked against.
    pub device: CudaKernelDeviceEnvelope,
    /// Requirement that was checked; synthesized from `shape`.
    pub requirement: CudaKernelRequirement,
    /// Launch shape exactly as supplied by the caller.
    pub shape: CudaKernelLaunchShape,
    /// Overflow-checked product of `shape.grid`.
    pub grid_blocks: u64,
    /// Overflow-checked product of `shape.block`.
    pub threads_per_block: u32,
    /// Optional upper bound on `grid_blocks`; `None` disables the residency check.
    pub cooperative_resident_block_limit: Option<u64>,
    /// Capability verdict for `requirement` against `device`.
    pub diagnostic: CudaKernelLaunchDiagnostic,
}

impl CudaKernelLaunchEnvelope {
    /// Returns `true` when the capability diagnostic has no failures and the grid
    /// fits the residency limit.
    ///
    /// The residency limit is enforced whenever it is `Some`, independent of
    /// `shape.cooperative`.
    #[must_use]
    pub fn is_launchable(&self) -> bool {
        let fits_residency = match self.cooperative_resident_block_limit {
            Some(limit) => self.grid_blocks <= limit,
            None => true,
        };
        fits_residency && self.diagnostic.is_launchable()
    }

    /// Renders the capability message followed by the launch-shape suffix.
    ///
    /// The `status=` segment reflects only the capability diagnostic; a residency
    /// overrun is reported by a separate trailing segment.
    #[must_use]
    pub fn stable_message(&self) -> String {
        let mut rendered = String::new();
        self.diagnostic.stable_message_into(&mut rendered);
        push_launch_envelope_suffix(self, &mut rendered);
        rendered
    }
}
/// Error raised when a launch envelope cannot be derived from a shape.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CudaKernelLaunchEnvelopeError {
    /// Human-readable description of the problem and how to fix it.
    pub fix: String,
}

impl std::fmt::Display for CudaKernelLaunchEnvelopeError {
    /// Writes `fix` verbatim; width and alignment flags on the formatter are ignored.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.fix)
    }
}

impl std::error::Error for CudaKernelLaunchEnvelopeError {}
/// One reason a kernel requirement is not met by a device.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CudaKernelCapabilityFailure {
    /// The device SM version is lower than the required one.
    SmVersion {
        /// Required SM major version.
        required_major: u16,
        /// Required SM minor version.
        required_minor: u16,
        /// SM major version the device reports.
        actual_major: u16,
        /// SM minor version the device reports.
        actual_minor: u16,
    },
    /// More threads per block were requested than the device allows.
    ThreadsPerBlock {
        /// Threads per block the launch asked for.
        requested: u32,
        /// Device maximum.
        maximum: u32,
    },
    /// More shared memory per block was requested than the device allows.
    SharedMemory {
        /// Bytes the launch asked for.
        requested: u64,
        /// Device maximum, in bytes.
        maximum: u64,
    },
    /// Cooperative launch is required but the device does not support it.
    CooperativeLaunch,
    /// Tensor cores are required but the device does not support them.
    TensorCores,
}
/// Owned capability verdict for one kernel: its name plus every unmet requirement.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CudaKernelLaunchDiagnostic {
    /// Kernel name written into the stable message.
    pub kernel: &'static str,
    /// Unmet requirements; an empty list means the kernel is launchable.
    pub failures: Vec<CudaKernelCapabilityFailure>,
}

impl CudaKernelLaunchDiagnostic {
    /// Returns `true` when no capability failure was recorded.
    #[must_use]
    pub fn is_launchable(&self) -> bool {
        self.failures.is_empty()
    }

    /// Renders the stable capability message into a freshly allocated `String`.
    #[must_use]
    pub fn stable_message(&self) -> String {
        let mut rendered = String::new();
        self.stable_message_into(&mut rendered);
        rendered
    }

    /// Renders the stable capability message into `out`.
    ///
    /// Any previous contents of `out` are cleared first, so a caller can reuse one buffer.
    pub fn stable_message_into(&self, out: &mut String) {
        write_stable_message(self.kernel, &self.failures, out);
    }
}
/// Caller-owned buffers that repeated diagnoses can reuse.
///
/// Both buffers are cleared and refilled by the operations that use them, so
/// results from an earlier call never leak into a later one.
#[derive(Debug, Default)]
pub struct CudaKernelLaunchDiagnosticScratch {
    /// Failure list rewritten by each diagnosis.
    failures: Vec<CudaKernelCapabilityFailure>,
    /// Message buffer rewritten by each rendering.
    message: String,
}

impl CudaKernelLaunchDiagnosticScratch {
    /// Diagnoses `requirement` against `device` and returns the stable message.
    ///
    /// The returned slice borrows this scratch's message buffer.
    pub fn diagnose_stable_message(
        &mut self,
        kernel: &'static str,
        device: CudaKernelDeviceEnvelope,
        requirement: CudaKernelRequirement,
    ) -> &str {
        let Self { failures, message } = self;
        record_cuda_kernel_launch_failures(device, requirement, failures);
        write_stable_message(kernel, failures.as_slice(), message);
        message.as_str()
    }

    /// Renders the stable message for an already-computed failure list.
    ///
    /// Only the message buffer is touched; the scratch's own failure list is left as is.
    pub fn stable_message_for_failures(
        &mut self,
        kernel: &'static str,
        failures: &[CudaKernelCapabilityFailure],
    ) -> &str {
        let message = &mut self.message;
        write_stable_message(kernel, failures, message);
        message.as_str()
    }
}
/// Borrowed capability verdict whose failure list lives in caller-owned storage.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaKernelLaunchDiagnosticRef<'a> {
    /// Kernel name the verdict applies to.
    pub kernel: &'static str,
    /// Unmet requirements; an empty slice means the kernel is launchable.
    pub failures: &'a [CudaKernelCapabilityFailure],
}

impl CudaKernelLaunchDiagnosticRef<'_> {
    /// Returns `true` when the borrowed failure list is empty.
    #[must_use]
    pub fn is_launchable(&self) -> bool {
        self.failures.is_empty()
    }
}
#[must_use]
pub fn diagnose_cuda_kernel_launch(
kernel: &'static str,
device: CudaKernelDeviceEnvelope,
requirement: CudaKernelRequirement,
) -> CudaKernelLaunchDiagnostic {
let mut scratch = CudaKernelLaunchDiagnosticScratch::default();
let kernel = {
let diagnostic =
diagnose_cuda_kernel_launch_with_scratch(kernel, device, requirement, &mut scratch);
diagnostic.kernel
};
CudaKernelLaunchDiagnostic {
kernel,
failures: scratch.failures,
}
}
pub fn diagnose_cuda_kernel_launch_shape(
kernel: &'static str,
device: CudaKernelDeviceEnvelope,
shape: CudaKernelLaunchShape,
cooperative_resident_block_limit: Option<u64>,
) -> Result<CudaKernelLaunchEnvelope, CudaKernelLaunchEnvelopeError> {
let grid_blocks = checked_dim_product_u64(shape.grid, "grid block count")?;
let threads_per_block = checked_dim_product_u32(shape.block, "threads per block")?;
let requirement = CudaKernelRequirement {
min_sm_major: 0,
min_sm_minor: 0,
requested_threads_per_block: threads_per_block,
requested_shared_memory_bytes: u64::from(shape.dynamic_shared_memory_bytes),
requires_cooperative_launch: shape.cooperative,
requires_tensor_cores: shape.requires_tensor_cores,
};
let diagnostic = diagnose_cuda_kernel_launch(kernel, device, requirement);
Ok(CudaKernelLaunchEnvelope {
kernel,
device,
requirement,
shape,
grid_blocks,
threads_per_block,
cooperative_resident_block_limit,
diagnostic,
})
}
/// Checks `requirement` against `device`, storing the failures in `scratch`.
///
/// The scratch failure list is cleared and refilled on every call; the returned
/// view borrows it, so the scratch cannot be reused until the view is dropped.
pub fn diagnose_cuda_kernel_launch_with_scratch<'a>(
    kernel: &'static str,
    device: CudaKernelDeviceEnvelope,
    requirement: CudaKernelRequirement,
    scratch: &'a mut CudaKernelLaunchDiagnosticScratch,
) -> CudaKernelLaunchDiagnosticRef<'a> {
    record_cuda_kernel_launch_failures(device, requirement, &mut scratch.failures);
    let failures = scratch.failures.as_slice();
    CudaKernelLaunchDiagnosticRef { kernel, failures }
}
/// Replaces the contents of `failures` with every requirement `device` does not meet.
///
/// Failures are pushed in a fixed order: SM version, threads per block, shared
/// memory, cooperative launch, tensor cores. A request equal to the device maximum
/// is accepted.
fn record_cuda_kernel_launch_failures(
    device: CudaKernelDeviceEnvelope,
    requirement: CudaKernelRequirement,
    failures: &mut Vec<CudaKernelCapabilityFailure>,
) {
    failures.clear();

    // Tuple ordering is lexicographic: major decides, minor breaks ties.
    let required_sm = (requirement.min_sm_major, requirement.min_sm_minor);
    let actual_sm = (device.sm_major, device.sm_minor);
    if actual_sm < required_sm {
        let (required_major, required_minor) = required_sm;
        let (actual_major, actual_minor) = actual_sm;
        failures.push(CudaKernelCapabilityFailure::SmVersion {
            required_major,
            required_minor,
            actual_major,
            actual_minor,
        });
    }

    let thread_limit = device.max_threads_per_block;
    let threads_wanted = requirement.requested_threads_per_block;
    if thread_limit < threads_wanted {
        failures.push(CudaKernelCapabilityFailure::ThreadsPerBlock {
            requested: threads_wanted,
            maximum: thread_limit,
        });
    }

    let shared_limit = device.shared_memory_per_block_bytes;
    let shared_wanted = requirement.requested_shared_memory_bytes;
    if shared_limit < shared_wanted {
        failures.push(CudaKernelCapabilityFailure::SharedMemory {
            requested: shared_wanted,
            maximum: shared_limit,
        });
    }

    if !device.supports_cooperative_launch && requirement.requires_cooperative_launch {
        failures.push(CudaKernelCapabilityFailure::CooperativeLaunch);
    }

    if !device.supports_tensor_cores && requirement.requires_tensor_cores {
        failures.push(CudaKernelCapabilityFailure::TensorCores);
    }
}
/// Overwrites `out` with the stable capability message for `kernel`.
///
/// With no failures the message ends in `status=ok`. Otherwise it ends in
/// `status=blocked|fix=` followed by one comma-separated entry per failure, in
/// slice order. `out` is cleared first, so its allocation can be reused.
fn write_stable_message(
    kernel: &'static str,
    failures: &[CudaKernelCapabilityFailure],
    out: &mut String,
) {
    use std::fmt::Write as _;

    /// Appends the entry for a single failure; writes no separator.
    fn write_failure(failure: &CudaKernelCapabilityFailure, out: &mut String) {
        use std::fmt::Write as _;

        // Formatting into a `String` cannot fail, so the `write!` results are discarded.
        match failure {
            CudaKernelCapabilityFailure::SmVersion {
                required_major,
                required_minor,
                actual_major,
                actual_minor,
            } => {
                let _ = write!(
                    out,
                    "sm_version(required={required_major}.{required_minor},actual={actual_major}.{actual_minor})"
                );
            }
            CudaKernelCapabilityFailure::ThreadsPerBlock { requested, maximum } => {
                let _ = write!(
                    out,
                    "threads_per_block(requested={requested},max={maximum})"
                );
            }
            CudaKernelCapabilityFailure::SharedMemory { requested, maximum } => {
                let _ = write!(out, "shared_memory(requested={requested},max={maximum})");
            }
            CudaKernelCapabilityFailure::CooperativeLaunch => out.push_str("cooperative_launch"),
            CudaKernelCapabilityFailure::TensorCores => out.push_str("tensor_cores"),
        }
    }

    out.clear();
    let _ = write!(out, "cuda-kernel-capability-v1|kernel={kernel}|status=");

    let (first, rest) = match failures.split_first() {
        Some(parts) => parts,
        None => {
            out.push_str("ok");
            return;
        }
    };

    out.push_str("blocked|fix=");
    write_failure(first, out);
    for failure in rest {
        out.push(',');
        write_failure(failure, out);
    }
}
/// Appends the launch-shape segments of `envelope` to `out` without clearing it.
///
/// The grid, block, derived totals and dynamic shared-memory size are always
/// written. When a residency limit is present it is written too, followed by a
/// `cooperative_residency=blocked(...)` segment if the grid exceeds it.
fn push_launch_envelope_suffix(envelope: &CudaKernelLaunchEnvelope, out: &mut String) {
    use std::fmt::Write as _;

    let shape = &envelope.shape;
    let required = envelope.grid_blocks;

    // Formatting into a `String` cannot fail, so the `write!` results are discarded.
    let _ = write!(
        out,
        "|grid={:?}|block={:?}|grid_blocks={}|threads_per_block={}|dynamic_shared_bytes={}",
        shape.grid,
        shape.block,
        required,
        envelope.threads_per_block,
        shape.dynamic_shared_memory_bytes
    );

    match envelope.cooperative_resident_block_limit {
        None => {}
        Some(limit) => {
            let _ = write!(out, "|cooperative_resident_block_limit={limit}");
            if limit < required {
                let _ = write!(
                    out,
                    "|cooperative_residency=blocked(required={},limit={})",
                    required, limit
                );
            }
        }
    }
}
/// Multiplies `dims` into a `u64` through the shared CUDA numeric policy.
///
/// `label` names the quantity in the error text.
///
/// # Errors
///
/// Returns [`CudaKernelLaunchEnvelopeError`] when the numeric policy reports that
/// the product has no `u64` result.
fn checked_dim_product_u64(
    dims: [u32; 3],
    label: &'static str,
) -> Result<u64, CudaKernelLaunchEnvelopeError> {
    let overflow = || CudaKernelLaunchEnvelopeError {
        fix: format!(
            "CUDA launch envelope {label} overflowed u64 for dimensions {dims:?}. Fix: shard the launch before release diagnostics."
        ),
    };
    // Keep this call text on one line: a source-inspection test in this file
    // looks for it verbatim.
    CUDA_NUMERIC.checked_dim_product_u64(dims).ok_or_else(overflow)
}
/// Multiplies `dims` into a `u32` through the shared CUDA numeric policy.
///
/// `label` names the quantity in the error text. The error reports the product as
/// a `u64` when one exists, or the phrase `overflowed u64` when it does not.
///
/// # Errors
///
/// Returns [`CudaKernelLaunchEnvelopeError`] when the numeric policy reports that
/// the product has no `u32` result.
fn checked_dim_product_u32(
    dims: [u32; 3],
    label: &'static str,
) -> Result<u32, CudaKernelLaunchEnvelopeError> {
    // Keep this call text on one line: a source-inspection test in this file
    // looks for it verbatim.
    CUDA_NUMERIC.checked_dim_product_u32(dims).ok_or_else(|| {
        // Query the numeric policy directly for the wide product. Going through
        // the `Result`-returning wrapper would format and allocate a complete
        // error message only for it to be thrown away here.
        let product = CUDA_NUMERIC.checked_dim_product_u64(dims).map_or_else(
            || "overflowed u64".to_string(),
            |value| value.to_string(),
        );
        CudaKernelLaunchEnvelopeError {
            fix: format!(
                "CUDA launch envelope {label} value {product} cannot fit u32. Fix: lower block dimensions before launch."
            ),
        }
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Device that satisfies every requirement used in this module.
    fn device() -> CudaKernelDeviceEnvelope {
        CudaKernelDeviceEnvelope {
            sm_major: 12,
            sm_minor: 0,
            max_threads_per_block: 1_024,
            shared_memory_per_block_bytes: 99_840,
            supports_cooperative_launch: true,
            supports_tensor_cores: true,
        }
    }

    /// Device that fails every check against `demanding_requirement`.
    fn underpowered_device() -> CudaKernelDeviceEnvelope {
        CudaKernelDeviceEnvelope {
            sm_major: 8,
            sm_minor: 6,
            max_threads_per_block: 512,
            shared_memory_per_block_bytes: 16_384,
            supports_cooperative_launch: false,
            supports_tensor_cores: false,
        }
    }

    /// Requirement that `device()` satisfies in full.
    fn modest_requirement() -> CudaKernelRequirement {
        CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 256,
            requested_shared_memory_bytes: 32_768,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        }
    }

    /// Requirement that `underpowered_device()` misses on all five counts.
    fn demanding_requirement() -> CudaKernelRequirement {
        CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 1_024,
            requested_shared_memory_bytes: 65_536,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        }
    }

    #[test]
    fn diagnostic_accepts_satisfied_kernel_requirements() {
        let diagnostic = diagnose_cuda_kernel_launch("frontier", device(), modest_requirement());

        assert!(diagnostic.is_launchable());
        assert_eq!(
            diagnostic.stable_message(),
            "cuda-kernel-capability-v1|kernel=frontier|status=ok"
        );
    }

    #[test]
    fn diagnostic_reports_every_missing_requirement() {
        let diagnostic = diagnose_cuda_kernel_launch(
            "frontier",
            underpowered_device(),
            demanding_requirement(),
        );

        assert!(!diagnostic.is_launchable());
        assert_eq!(diagnostic.failures.len(), 5);

        let message = diagnostic.stable_message();
        assert!(message.contains("sm_version(required=9.0,actual=8.6)"));
        assert!(message.contains("threads_per_block(requested=1024,max=512)"));
        assert!(message.contains("shared_memory(requested=65536,max=16384)"));
        assert!(message.contains("cooperative_launch"));
        assert!(message.contains("tensor_cores"));
    }

    #[test]
    fn launch_envelope_records_shape_residency_and_stable_message() {
        let shape = CudaKernelLaunchShape {
            grid: [9, 2, 1],
            block: [128, 2, 1],
            dynamic_shared_memory_bytes: 32_768,
            cooperative: true,
            requires_tensor_cores: true,
        };
        let envelope = diagnose_cuda_kernel_launch_shape("frontier", device(), shape, Some(16))
            .expect("Fix: valid CUDA launch envelope should derive");

        assert_eq!(envelope.grid_blocks, 18);
        assert_eq!(envelope.threads_per_block, 256);
        // 18 blocks exceed the residency limit of 16.
        assert!(!envelope.is_launchable());

        let message = envelope.stable_message();
        assert!(message.contains("cuda-kernel-capability-v1|kernel=frontier"));
        assert!(message.contains("grid_blocks=18"));
        assert!(message.contains("threads_per_block=256"));
        assert!(message.contains("cooperative_residency=blocked(required=18,limit=16)"));
    }

    #[test]
    fn launch_envelope_rejects_thread_block_product_overflow() {
        let shape = CudaKernelLaunchShape {
            grid: [1, 1, 1],
            block: [u32::MAX, u32::MAX, 2],
            dynamic_shared_memory_bytes: 0,
            cooperative: false,
            requires_tensor_cores: false,
        };
        let error = diagnose_cuda_kernel_launch_shape("frontier", device(), shape, None)
            .expect_err("oversized CUDA block shape must fail before diagnostics");

        assert!(error.fix.contains("threads per block"));
    }

    #[test]
    fn diagnostic_scratch_reuses_failure_and_message_storage() {
        let mut scratch = CudaKernelLaunchDiagnosticScratch::default();

        // First blocked diagnosis: remember where the failure list lives.
        let failures_ptr = {
            let blocked = diagnose_cuda_kernel_launch_with_scratch(
                "frontier",
                underpowered_device(),
                demanding_requirement(),
                &mut scratch,
            );
            assert!(!blocked.is_launchable());
            assert_eq!(blocked.failures.len(), 5);
            blocked.failures.as_ptr()
        };

        // Blocked message: remember where the message buffer lives.
        let message = scratch.diagnose_stable_message(
            "frontier",
            underpowered_device(),
            demanding_requirement(),
        );
        assert!(message.contains("status=blocked"));
        let message_ptr = message.as_ptr();

        // A launchable diagnosis must land in the same failure storage.
        let launchable_failures_ptr = {
            let launchable = diagnose_cuda_kernel_launch_with_scratch(
                "frontier",
                device(),
                modest_requirement(),
                &mut scratch,
            );
            assert!(launchable.is_launchable());
            launchable.failures.as_ptr()
        };
        assert_eq!(launchable_failures_ptr, failures_ptr);

        // The shorter ok message must land in the same message storage.
        let message = scratch.diagnose_stable_message("frontier", device(), modest_requirement());
        assert_eq!(
            message,
            "cuda-kernel-capability-v1|kernel=frontier|status=ok"
        );
        assert_eq!(
            message.as_ptr(),
            message_ptr,
            "Fix: repeated CUDA launch diagnostics must reuse caller-owned message storage instead of allocating one string per failure and joining them."
        );
    }
}
#[cfg(test)]
mod owned_diagnostic_allocation_tests {
    use super::*;

    /// Checks the owned diagnostic end to end, then inspects this file's own
    /// production source (the text before the first test gate) for policy violations.
    #[test]
    fn owned_diagnostic_moves_failures_out_of_scratch_without_clone() {
        let weak_device = CudaKernelDeviceEnvelope {
            sm_major: 8,
            sm_minor: 9,
            max_threads_per_block: 512,
            shared_memory_per_block_bytes: 32_768,
            supports_cooperative_launch: false,
            supports_tensor_cores: false,
        };
        let heavy_requirement = CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 1_024,
            requested_shared_memory_bytes: 65_536,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        };

        let diagnostic = diagnose_cuda_kernel_launch("frontier", weak_device, heavy_requirement);
        assert_eq!(diagnostic.failures.len(), 5);

        let source = include_str!("kernel_failure_diagnostics.rs");
        let production = source
            .split("#[cfg(test)]")
            .next()
            .expect("Fix: CUDA diagnostic production source must be present before tests");

        let clones_failures = production.contains(".to_vec()");
        assert!(
            !clones_failures,
            "Fix: owned CUDA launch diagnostics must move the scratch failure vector instead of cloning it."
        );

        // The forbidden path is assembled with `concat!` so this test's own text
        // never contains it contiguously.
        let bypasses_shared_policy = production.contains(concat!(
            "vyre_driver::numeric::",
            "checked_dim_product"
        ));
        let routes_through_shared_policy = production.contains("use crate::numeric::CUDA_NUMERIC;")
            && production.contains("CUDA_NUMERIC.checked_dim_product_u64(dims)")
            && production.contains("CUDA_NUMERIC.checked_dim_product_u32(dims)")
            && !bypasses_shared_policy;
        assert!(
            routes_through_shared_policy,
            "Fix: CUDA launch-envelope dimension products must route through the shared CUDA numeric policy."
        );
    }
}