1use crate::numeric::CUDA_NUMERIC;
4
/// Capability snapshot of a CUDA device as consumed by the launch diagnostics in this file.
///
/// Each field is compared against the corresponding field of a
/// [`CudaKernelRequirement`] when launch failures are recorded.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CudaKernelDeviceEnvelope {
    /// Major component of the device SM version.
    pub sm_major: u16,
    /// Minor component of the device SM version; compared together with `sm_major`.
    pub sm_minor: u16,
    /// Largest thread count per block the device accepts.
    pub max_threads_per_block: u32,
    /// Largest shared-memory request per block, in bytes, the device accepts.
    pub shared_memory_per_block_bytes: u64,
    /// Whether a kernel that requires a cooperative launch can run on this device.
    pub supports_cooperative_launch: bool,
    /// Whether a kernel that requires tensor cores can run on this device.
    pub supports_tensor_cores: bool,
}
21
/// What a kernel needs from a device before it can be launched.
///
/// Checked field by field against a [`CudaKernelDeviceEnvelope`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CudaKernelRequirement {
    /// Major component of the lowest acceptable SM version.
    pub min_sm_major: u16,
    /// Minor component of the lowest acceptable SM version.
    pub min_sm_minor: u16,
    /// Threads per block the kernel asks for; must not exceed the device maximum.
    pub requested_threads_per_block: u32,
    /// Shared memory per block, in bytes, the kernel asks for; must not exceed the device maximum.
    pub requested_shared_memory_bytes: u64,
    /// When `true`, the device must support cooperative launch.
    pub requires_cooperative_launch: bool,
    /// When `true`, the device must support tensor cores.
    pub requires_tensor_cores: bool,
}
38
/// Concrete launch geometry and feature flags for one kernel launch.
///
/// [`diagnose_cuda_kernel_launch_shape`] turns this into a [`CudaKernelRequirement`]
/// and derived block/thread counts.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CudaKernelLaunchShape {
    /// Grid dimensions; their product becomes the envelope's `grid_blocks`.
    pub grid: [u32; 3],
    /// Block dimensions; their product becomes the envelope's `threads_per_block`.
    pub block: [u32; 3],
    /// Dynamic shared memory requested for the launch, in bytes.
    pub dynamic_shared_memory_bytes: u32,
    /// Whether the launch is cooperative; mapped to `requires_cooperative_launch`.
    pub cooperative: bool,
    /// Whether the kernel needs tensor cores.
    pub requires_tensor_cores: bool,
}
53
54#[derive(Clone, Debug, Eq, PartialEq)]
57pub struct CudaKernelLaunchEnvelope {
58 pub kernel: &'static str,
60 pub device: CudaKernelDeviceEnvelope,
62 pub requirement: CudaKernelRequirement,
64 pub shape: CudaKernelLaunchShape,
66 pub grid_blocks: u64,
68 pub threads_per_block: u32,
70 pub cooperative_resident_block_limit: Option<u64>,
72 pub diagnostic: CudaKernelLaunchDiagnostic,
74}
75
76impl CudaKernelLaunchEnvelope {
77 #[must_use]
79 pub fn is_launchable(&self) -> bool {
80 self.diagnostic.is_launchable()
81 && self
82 .cooperative_resident_block_limit
83 .is_none_or(|limit| self.grid_blocks <= limit)
84 }
85
86 #[must_use]
88 pub fn stable_message(&self) -> String {
89 let mut message = self.diagnostic.stable_message();
90 push_launch_envelope_suffix(self, &mut message);
91 message
92 }
93}
94
/// Error raised when a launch envelope cannot be derived from a launch shape.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CudaKernelLaunchEnvelopeError {
    /// Full human-readable message; this is exactly what `Display` prints.
    pub fix: String,
}

impl std::fmt::Display for CudaKernelLaunchEnvelopeError {
    /// Writes `fix` verbatim; width and padding flags on the formatter are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { fix } = self;
        f.write_str(fix)
    }
}

// No `source()` override: the error carries only its message.
impl std::error::Error for CudaKernelLaunchEnvelopeError {}
109
/// One reason a kernel requirement is not met by a device.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CudaKernelCapabilityFailure {
    /// The device SM version is lower than the required minimum.
    SmVersion {
        /// Major component of the required minimum.
        required_major: u16,
        /// Minor component of the required minimum.
        required_minor: u16,
        /// Major component reported by the device.
        actual_major: u16,
        /// Minor component reported by the device.
        actual_minor: u16,
    },
    /// More threads per block were requested than the device allows.
    ThreadsPerBlock {
        /// Threads per block the kernel asked for.
        requested: u32,
        /// Device maximum.
        maximum: u32,
    },
    /// More shared memory per block was requested than the device allows.
    SharedMemory {
        /// Bytes the kernel asked for.
        requested: u64,
        /// Device maximum in bytes.
        maximum: u64,
    },
    /// Cooperative launch is required but not supported by the device.
    CooperativeLaunch,
    /// Tensor cores are required but not supported by the device.
    TensorCores,
}
143
144#[derive(Clone, Debug, Eq, PartialEq)]
146pub struct CudaKernelLaunchDiagnostic {
147 pub kernel: &'static str,
149 pub failures: Vec<CudaKernelCapabilityFailure>,
151}
152
153impl CudaKernelLaunchDiagnostic {
154 #[must_use]
156 pub fn is_launchable(&self) -> bool {
157 self.failures.is_empty()
158 }
159
160 #[must_use]
162 pub fn stable_message(&self) -> String {
163 let mut message = String::new();
164 write_stable_message(self.kernel, &self.failures, &mut message);
165 message
166 }
167
168 pub fn stable_message_into(&self, out: &mut String) {
170 write_stable_message(self.kernel, &self.failures, out);
171 }
172}
173
174#[derive(Debug, Default)]
176pub struct CudaKernelLaunchDiagnosticScratch {
177 failures: Vec<CudaKernelCapabilityFailure>,
178 message: String,
179}
180
181impl CudaKernelLaunchDiagnosticScratch {
182 pub fn diagnose_stable_message(
184 &mut self,
185 kernel: &'static str,
186 device: CudaKernelDeviceEnvelope,
187 requirement: CudaKernelRequirement,
188 ) -> &str {
189 record_cuda_kernel_launch_failures(device, requirement, &mut self.failures);
190 write_stable_message(kernel, &self.failures, &mut self.message);
191 &self.message
192 }
193
194 pub fn stable_message_for_failures(
196 &mut self,
197 kernel: &'static str,
198 failures: &[CudaKernelCapabilityFailure],
199 ) -> &str {
200 write_stable_message(kernel, failures, &mut self.message);
201 &self.message
202 }
203}
204
205#[derive(Clone, Copy, Debug, Eq, PartialEq)]
207pub struct CudaKernelLaunchDiagnosticRef<'a> {
208 pub kernel: &'static str,
210 pub failures: &'a [CudaKernelCapabilityFailure],
212}
213
214impl CudaKernelLaunchDiagnosticRef<'_> {
215 #[must_use]
217 pub fn is_launchable(&self) -> bool {
218 self.failures.is_empty()
219 }
220}
221
222#[must_use]
224pub fn diagnose_cuda_kernel_launch(
225 kernel: &'static str,
226 device: CudaKernelDeviceEnvelope,
227 requirement: CudaKernelRequirement,
228) -> CudaKernelLaunchDiagnostic {
229 let mut scratch = CudaKernelLaunchDiagnosticScratch::default();
230 let kernel = {
231 let diagnostic =
232 diagnose_cuda_kernel_launch_with_scratch(kernel, device, requirement, &mut scratch);
233 diagnostic.kernel
234 };
235 CudaKernelLaunchDiagnostic {
236 kernel,
237 failures: scratch.failures,
238 }
239}
240
241pub fn diagnose_cuda_kernel_launch_shape(
249 kernel: &'static str,
250 device: CudaKernelDeviceEnvelope,
251 shape: CudaKernelLaunchShape,
252 cooperative_resident_block_limit: Option<u64>,
253) -> Result<CudaKernelLaunchEnvelope, CudaKernelLaunchEnvelopeError> {
254 let grid_blocks = checked_dim_product_u64(shape.grid, "grid block count")?;
255 let threads_per_block = checked_dim_product_u32(shape.block, "threads per block")?;
256 let requirement = CudaKernelRequirement {
257 min_sm_major: 0,
258 min_sm_minor: 0,
259 requested_threads_per_block: threads_per_block,
260 requested_shared_memory_bytes: u64::from(shape.dynamic_shared_memory_bytes),
261 requires_cooperative_launch: shape.cooperative,
262 requires_tensor_cores: shape.requires_tensor_cores,
263 };
264 let diagnostic = diagnose_cuda_kernel_launch(kernel, device, requirement);
265 Ok(CudaKernelLaunchEnvelope {
266 kernel,
267 device,
268 requirement,
269 shape,
270 grid_blocks,
271 threads_per_block,
272 cooperative_resident_block_limit,
273 diagnostic,
274 })
275}
276
277pub fn diagnose_cuda_kernel_launch_with_scratch<'a>(
279 kernel: &'static str,
280 device: CudaKernelDeviceEnvelope,
281 requirement: CudaKernelRequirement,
282 scratch: &'a mut CudaKernelLaunchDiagnosticScratch,
283) -> CudaKernelLaunchDiagnosticRef<'a> {
284 record_cuda_kernel_launch_failures(device, requirement, &mut scratch.failures);
285
286 CudaKernelLaunchDiagnosticRef {
287 kernel,
288 failures: &scratch.failures,
289 }
290}
291
292fn record_cuda_kernel_launch_failures(
293 device: CudaKernelDeviceEnvelope,
294 requirement: CudaKernelRequirement,
295 failures: &mut Vec<CudaKernelCapabilityFailure>,
296) {
297 failures.clear();
298 if (device.sm_major, device.sm_minor) < (requirement.min_sm_major, requirement.min_sm_minor) {
299 failures.push(CudaKernelCapabilityFailure::SmVersion {
300 required_major: requirement.min_sm_major,
301 required_minor: requirement.min_sm_minor,
302 actual_major: device.sm_major,
303 actual_minor: device.sm_minor,
304 });
305 }
306 if requirement.requested_threads_per_block > device.max_threads_per_block {
307 failures.push(CudaKernelCapabilityFailure::ThreadsPerBlock {
308 requested: requirement.requested_threads_per_block,
309 maximum: device.max_threads_per_block,
310 });
311 }
312 if requirement.requested_shared_memory_bytes > device.shared_memory_per_block_bytes {
313 failures.push(CudaKernelCapabilityFailure::SharedMemory {
314 requested: requirement.requested_shared_memory_bytes,
315 maximum: device.shared_memory_per_block_bytes,
316 });
317 }
318 if requirement.requires_cooperative_launch && !device.supports_cooperative_launch {
319 failures.push(CudaKernelCapabilityFailure::CooperativeLaunch);
320 }
321 if requirement.requires_tensor_cores && !device.supports_tensor_cores {
322 failures.push(CudaKernelCapabilityFailure::TensorCores);
323 }
324}
325
326fn write_stable_message(
327 kernel: &'static str,
328 failures: &[CudaKernelCapabilityFailure],
329 out: &mut String,
330) {
331 use std::fmt::Write as _;
332
333 out.clear();
334 let _ = write!(out, "cuda-kernel-capability-v1|kernel={kernel}|status=");
335 if failures.is_empty() {
336 out.push_str("ok");
337 return;
338 }
339 out.push_str("blocked|fix=");
340 for (index, failure) in failures.iter().enumerate() {
341 if index > 0 {
342 out.push(',');
343 }
344 match failure {
345 CudaKernelCapabilityFailure::SmVersion {
346 required_major,
347 required_minor,
348 actual_major,
349 actual_minor,
350 } => {
351 let _ = write!(
352 out,
353 "sm_version(required={required_major}.{required_minor},actual={actual_major}.{actual_minor})"
354 );
355 }
356 CudaKernelCapabilityFailure::ThreadsPerBlock { requested, maximum } => {
357 let _ = write!(
358 out,
359 "threads_per_block(requested={requested},max={maximum})"
360 );
361 }
362 CudaKernelCapabilityFailure::SharedMemory { requested, maximum } => {
363 let _ = write!(out, "shared_memory(requested={requested},max={maximum})");
364 }
365 CudaKernelCapabilityFailure::CooperativeLaunch => out.push_str("cooperative_launch"),
366 CudaKernelCapabilityFailure::TensorCores => out.push_str("tensor_cores"),
367 }
368 }
369}
370
371fn push_launch_envelope_suffix(envelope: &CudaKernelLaunchEnvelope, out: &mut String) {
372 use std::fmt::Write as _;
373
374 let _ = write!(
375 out,
376 "|grid={:?}|block={:?}|grid_blocks={}|threads_per_block={}|dynamic_shared_bytes={}",
377 envelope.shape.grid,
378 envelope.shape.block,
379 envelope.grid_blocks,
380 envelope.threads_per_block,
381 envelope.shape.dynamic_shared_memory_bytes
382 );
383 if let Some(limit) = envelope.cooperative_resident_block_limit {
384 let _ = write!(out, "|cooperative_resident_block_limit={limit}");
385 if envelope.grid_blocks > limit {
386 let _ = write!(
387 out,
388 "|cooperative_residency=blocked(required={},limit={})",
389 envelope.grid_blocks, limit
390 );
391 }
392 }
393}
394
395fn checked_dim_product_u64(
396 dims: [u32; 3],
397 label: &'static str,
398) -> Result<u64, CudaKernelLaunchEnvelopeError> {
399 CUDA_NUMERIC.checked_dim_product_u64(dims).ok_or_else(|| {
400 CudaKernelLaunchEnvelopeError {
401 fix: format!(
402 "CUDA launch envelope {label} overflowed u64 for dimensions {dims:?}. Fix: shard the launch before release diagnostics."
403 ),
404 }
405 })
406}
407
408fn checked_dim_product_u32(
409 dims: [u32; 3],
410 label: &'static str,
411) -> Result<u32, CudaKernelLaunchEnvelopeError> {
412 CUDA_NUMERIC.checked_dim_product_u32(dims).ok_or_else(|| {
413 let product = checked_dim_product_u64(dims, label).map_or_else(
414 |_| "overflowed u64".to_string(),
415 |value| value.to_string(),
416 );
417 CudaKernelLaunchEnvelopeError {
418 fix: format!(
419 "CUDA launch envelope {label} value {product} cannot fit u32. Fix: lower block dimensions before launch."
420 ),
421 }
422 })
423}
424
#[cfg(test)]
mod tests {
    use super::*;

    /// Device that satisfies every requirement used by the passing cases.
    fn device() -> CudaKernelDeviceEnvelope {
        CudaKernelDeviceEnvelope {
            sm_major: 12,
            sm_minor: 0,
            max_threads_per_block: 1_024,
            shared_memory_per_block_bytes: 99_840,
            supports_cooperative_launch: true,
            supports_tensor_cores: true,
        }
    }

    /// Device that misses every part of `demanding_requirement`.
    fn underpowered_device() -> CudaKernelDeviceEnvelope {
        CudaKernelDeviceEnvelope {
            sm_major: 8,
            sm_minor: 6,
            max_threads_per_block: 512,
            shared_memory_per_block_bytes: 16_384,
            supports_cooperative_launch: false,
            supports_tensor_cores: false,
        }
    }

    /// Requirement that `device()` fully satisfies.
    fn satisfiable_requirement() -> CudaKernelRequirement {
        CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 256,
            requested_shared_memory_bytes: 32_768,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        }
    }

    /// Requirement that `underpowered_device()` fails on all five checks.
    fn demanding_requirement() -> CudaKernelRequirement {
        CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 1_024,
            requested_shared_memory_bytes: 65_536,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        }
    }

    #[test]
    fn diagnostic_accepts_satisfied_kernel_requirements() {
        let diagnostic =
            diagnose_cuda_kernel_launch("frontier", device(), satisfiable_requirement());

        assert!(diagnostic.is_launchable());
        assert_eq!(
            diagnostic.stable_message(),
            "cuda-kernel-capability-v1|kernel=frontier|status=ok"
        );
    }

    #[test]
    fn diagnostic_reports_every_missing_requirement() {
        let diagnostic = diagnose_cuda_kernel_launch(
            "frontier",
            underpowered_device(),
            demanding_requirement(),
        );

        assert!(!diagnostic.is_launchable());
        assert_eq!(diagnostic.failures.len(), 5);

        let message = diagnostic.stable_message();
        for fragment in [
            "sm_version(required=9.0,actual=8.6)",
            "threads_per_block(requested=1024,max=512)",
            "shared_memory(requested=65536,max=16384)",
            "cooperative_launch",
            "tensor_cores",
        ] {
            assert!(message.contains(fragment));
        }
    }

    #[test]
    fn launch_envelope_records_shape_residency_and_stable_message() {
        let shape = CudaKernelLaunchShape {
            grid: [9, 2, 1],
            block: [128, 2, 1],
            dynamic_shared_memory_bytes: 32_768,
            cooperative: true,
            requires_tensor_cores: true,
        };
        let envelope = diagnose_cuda_kernel_launch_shape("frontier", device(), shape, Some(16))
            .expect("Fix: valid CUDA launch envelope should derive");

        assert_eq!(envelope.grid_blocks, 18);
        assert_eq!(envelope.threads_per_block, 256);
        assert!(!envelope.is_launchable());

        let message = envelope.stable_message();
        for fragment in [
            "cuda-kernel-capability-v1|kernel=frontier",
            "grid_blocks=18",
            "threads_per_block=256",
            "cooperative_residency=blocked(required=18,limit=16)",
        ] {
            assert!(message.contains(fragment));
        }
    }

    #[test]
    fn launch_envelope_rejects_thread_block_product_overflow() {
        let shape = CudaKernelLaunchShape {
            grid: [1, 1, 1],
            block: [u32::MAX, u32::MAX, 2],
            dynamic_shared_memory_bytes: 0,
            cooperative: false,
            requires_tensor_cores: false,
        };
        let error = diagnose_cuda_kernel_launch_shape("frontier", device(), shape, None)
            .expect_err("oversized CUDA block shape must fail before diagnostics");

        assert!(error.fix.contains("threads per block"));
    }

    #[test]
    fn diagnostic_scratch_reuses_failure_and_message_storage() {
        let mut scratch = CudaKernelLaunchDiagnosticScratch::default();

        // First pass fills the failure list; remember where its storage lives.
        let blocked = diagnose_cuda_kernel_launch_with_scratch(
            "frontier",
            underpowered_device(),
            demanding_requirement(),
            &mut scratch,
        );
        assert!(!blocked.is_launchable());
        assert_eq!(blocked.failures.len(), 5);
        let failures_ptr = blocked.failures.as_ptr();

        // Rendering the blocked message sizes the message buffer; remember it too.
        let blocked_message = scratch.diagnose_stable_message(
            "frontier",
            underpowered_device(),
            demanding_requirement(),
        );
        assert!(blocked_message.contains("status=blocked"));
        let message_ptr = blocked_message.as_ptr();

        // A later, launchable diagnosis must land in the same failure storage.
        let launchable = diagnose_cuda_kernel_launch_with_scratch(
            "frontier",
            device(),
            satisfiable_requirement(),
            &mut scratch,
        );
        assert!(launchable.is_launchable());
        let launchable_failures_ptr = launchable.failures.as_ptr();
        assert_eq!(launchable_failures_ptr, failures_ptr);

        let message =
            scratch.diagnose_stable_message("frontier", device(), satisfiable_requirement());

        assert_eq!(
            message,
            "cuda-kernel-capability-v1|kernel=frontier|status=ok"
        );
        assert_eq!(
            message.as_ptr(),
            message_ptr,
            "Fix: repeated CUDA launch diagnostics must reuse caller-owned message storage instead of allocating one string per failure and joining them."
        );
    }
}
633
#[cfg(test)]
mod owned_diagnostic_allocation_tests {
    use super::*;

    #[test]
    fn owned_diagnostic_moves_failures_out_of_scratch_without_clone() {
        let device = CudaKernelDeviceEnvelope {
            sm_major: 8,
            sm_minor: 9,
            max_threads_per_block: 512,
            shared_memory_per_block_bytes: 32_768,
            supports_cooperative_launch: false,
            supports_tensor_cores: false,
        };
        let requirement = CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 1_024,
            requested_shared_memory_bytes: 65_536,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        };

        let diagnostic = diagnose_cuda_kernel_launch("frontier", device, requirement);
        assert_eq!(diagnostic.failures.len(), 5);

        // Everything before the first test attribute counts as production source.
        let source = include_str!("kernel_failure_diagnostics.rs");
        let production = source
            .split("#[cfg(test)]")
            .next()
            .expect("Fix: CUDA diagnostic production source must be present before tests");

        let clones_failures = production.contains(".to_vec()");
        assert!(
            !clones_failures,
            "Fix: owned CUDA launch diagnostics must move the scratch failure vector instead of cloning it."
        );

        let imports_policy = production.contains("use crate::numeric::CUDA_NUMERIC;");
        let routes_u64 = production.contains("CUDA_NUMERIC.checked_dim_product_u64(dims)");
        let routes_u32 = production.contains("CUDA_NUMERIC.checked_dim_product_u32(dims)");
        let bypasses_policy = production.contains(concat!(
            "vyre_driver::numeric::",
            "checked_dim_product"
        ));
        assert!(
            imports_policy && routes_u64 && routes_u32 && !bypasses_policy,
            "Fix: CUDA launch-envelope dimension products must route through the shared CUDA numeric policy."
        );
    }
}
684