Skip to main content

vyre_driver_cuda/
kernel_failure_diagnostics.rs

1//! Actionable CUDA kernel capability diagnostics.
2
3use crate::numeric::CUDA_NUMERIC;
4
/// Capability snapshot of a CUDA device, consulted when deciding whether a
/// kernel launch is eligible.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CudaKernelDeviceEnvelope {
    /// Major component of the device's CUDA SM version.
    pub sm_major: u16,
    /// Minor component of the device's CUDA SM version.
    pub sm_minor: u16,
    /// Largest thread count a single block may use.
    pub max_threads_per_block: u32,
    /// Shared-memory bytes available to a single block.
    pub shared_memory_per_block_bytes: u64,
    /// True when the device supports cooperative grid launch.
    pub supports_cooperative_launch: bool,
    /// True when the device supports tensor-core lowering.
    pub supports_tensor_cores: bool,
}
21
/// What a kernel demands from the device before it may be launched.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CudaKernelRequirement {
    /// Lowest acceptable CUDA SM major version.
    pub min_sm_major: u16,
    /// Lowest acceptable CUDA SM minor version; only decisive when the major
    /// versions are equal.
    pub min_sm_minor: u16,
    /// Thread count per block the kernel asks for.
    pub requested_threads_per_block: u32,
    /// Shared-memory bytes per block the kernel asks for.
    pub requested_shared_memory_bytes: u64,
    /// True when the kernel needs cooperative launch.
    pub requires_cooperative_launch: bool,
    /// True when the kernel needs tensor-core instructions.
    pub requires_tensor_cores: bool,
}
38
/// Launch geometry and flags as requested by a runtime or a generated
/// launcher.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CudaKernelLaunchShape {
    /// Grid extent, counted in CUDA blocks per axis.
    pub grid: [u32; 3],
    /// Block extent, counted in CUDA threads per axis.
    pub block: [u32; 3],
    /// Bytes of dynamic shared memory requested at launch time.
    pub dynamic_shared_memory_bytes: u32,
    /// True when the launch goes through the cooperative kernel ABI.
    pub cooperative: bool,
    /// True when the kernel needs tensor-core instructions.
    pub requires_tensor_cores: bool,
}
53
54/// Release-path launch envelope: device caps, requested shape, derived
55/// residency numbers, and capability diagnostics in one stable record.
56#[derive(Clone, Debug, Eq, PartialEq)]
57pub struct CudaKernelLaunchEnvelope {
58    /// Kernel label supplied by the caller.
59    pub kernel: &'static str,
60    /// Probed device capability envelope.
61    pub device: CudaKernelDeviceEnvelope,
62    /// Kernel requirement derived from the requested launch shape.
63    pub requirement: CudaKernelRequirement,
64    /// Original launch shape.
65    pub shape: CudaKernelLaunchShape,
66    /// Exact grid block count.
67    pub grid_blocks: u64,
68    /// Exact CUDA thread count per block.
69    pub threads_per_block: u32,
70    /// Cooperative resident block limit when the cooperative ABI is used.
71    pub cooperative_resident_block_limit: Option<u64>,
72    /// Capability diagnostic for the launch.
73    pub diagnostic: CudaKernelLaunchDiagnostic,
74}
75
76impl CudaKernelLaunchEnvelope {
77    /// Return true when the launch is eligible on the device envelope.
78    #[must_use]
79    pub fn is_launchable(&self) -> bool {
80        self.diagnostic.is_launchable()
81            && self
82                .cooperative_resident_block_limit
83                .is_none_or(|limit| self.grid_blocks <= limit)
84    }
85
86    /// Stable one-line release diagnostic including shape and residency.
87    #[must_use]
88    pub fn stable_message(&self) -> String {
89        let mut message = self.diagnostic.stable_message();
90        push_launch_envelope_suffix(self, &mut message);
91        message
92    }
93}
94
/// Failure raised while deriving a CUDA launch envelope.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CudaKernelLaunchEnvelopeError {
    /// Message telling the caller how to fix the launch.
    pub fix: String,
}

impl std::fmt::Display for CudaKernelLaunchEnvelopeError {
    /// Emit the `fix` text verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { fix } = self;
        f.write_str(fix.as_str())
    }
}

// No `source()` override: the error carries only its own fix message.
impl std::error::Error for CudaKernelLaunchEnvelopeError {}
109
/// One reason a CUDA launch fails its capability check.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CudaKernelCapabilityFailure {
    /// The device's SM version is lower than the kernel requires.
    SmVersion {
        /// Major version the kernel requires.
        required_major: u16,
        /// Minor version the kernel requires.
        required_minor: u16,
        /// Major version the device reports.
        actual_major: u16,
        /// Minor version the device reports.
        actual_minor: u16,
    },
    /// The requested block size is larger than the device allows.
    ThreadsPerBlock {
        /// Threads per block that were requested.
        requested: u32,
        /// Largest value the device allows.
        maximum: u32,
    },
    /// The requested shared memory is larger than the device allows.
    SharedMemory {
        /// Shared-memory bytes that were requested.
        requested: u64,
        /// Largest value the device allows.
        maximum: u64,
    },
    /// The kernel needs cooperative launch but the device lacks it.
    CooperativeLaunch,
    /// The kernel needs tensor cores but the device lacks them.
    TensorCores,
}
143
144/// Launch diagnostic with all missing CUDA requirements.
145#[derive(Clone, Debug, Eq, PartialEq)]
146pub struct CudaKernelLaunchDiagnostic {
147    /// Kernel label supplied by the caller.
148    pub kernel: &'static str,
149    /// Missing or invalid requirements.
150    pub failures: Vec<CudaKernelCapabilityFailure>,
151}
152
153impl CudaKernelLaunchDiagnostic {
154    /// Return true when every requirement is satisfied.
155    #[must_use]
156    pub fn is_launchable(&self) -> bool {
157        self.failures.is_empty()
158    }
159
160    /// Stable single-line diagnostic for release logs.
161    #[must_use]
162    pub fn stable_message(&self) -> String {
163        let mut message = String::new();
164        write_stable_message(self.kernel, &self.failures, &mut message);
165        message
166    }
167
168    /// Write the stable single-line diagnostic into caller-owned storage.
169    pub fn stable_message_into(&self, out: &mut String) {
170        write_stable_message(self.kernel, &self.failures, out);
171    }
172}
173
174/// Caller-owned scratch for repeated CUDA launch diagnostics.
175#[derive(Debug, Default)]
176pub struct CudaKernelLaunchDiagnosticScratch {
177    failures: Vec<CudaKernelCapabilityFailure>,
178    message: String,
179}
180
181impl CudaKernelLaunchDiagnosticScratch {
182    /// Diagnose and build the stable single-line diagnostic in reusable storage.
183    pub fn diagnose_stable_message(
184        &mut self,
185        kernel: &'static str,
186        device: CudaKernelDeviceEnvelope,
187        requirement: CudaKernelRequirement,
188    ) -> &str {
189        record_cuda_kernel_launch_failures(device, requirement, &mut self.failures);
190        write_stable_message(kernel, &self.failures, &mut self.message);
191        &self.message
192    }
193
194    /// Build the stable single-line diagnostic for caller-owned failures.
195    pub fn stable_message_for_failures(
196        &mut self,
197        kernel: &'static str,
198        failures: &[CudaKernelCapabilityFailure],
199    ) -> &str {
200        write_stable_message(kernel, failures, &mut self.message);
201        &self.message
202    }
203}
204
205/// Borrowed launch diagnostic backed by caller-owned scratch.
206#[derive(Clone, Copy, Debug, Eq, PartialEq)]
207pub struct CudaKernelLaunchDiagnosticRef<'a> {
208    /// Kernel label supplied by the caller.
209    pub kernel: &'static str,
210    /// Missing or invalid requirements.
211    pub failures: &'a [CudaKernelCapabilityFailure],
212}
213
214impl CudaKernelLaunchDiagnosticRef<'_> {
215    /// Return true when every requirement is satisfied.
216    #[must_use]
217    pub fn is_launchable(&self) -> bool {
218        self.failures.is_empty()
219    }
220}
221
222/// Diagnose whether a CUDA kernel can launch on the selected device.
223#[must_use]
224pub fn diagnose_cuda_kernel_launch(
225    kernel: &'static str,
226    device: CudaKernelDeviceEnvelope,
227    requirement: CudaKernelRequirement,
228) -> CudaKernelLaunchDiagnostic {
229    let mut scratch = CudaKernelLaunchDiagnosticScratch::default();
230    let kernel = {
231        let diagnostic =
232            diagnose_cuda_kernel_launch_with_scratch(kernel, device, requirement, &mut scratch);
233        diagnostic.kernel
234    };
235    CudaKernelLaunchDiagnostic {
236        kernel,
237        failures: scratch.failures,
238    }
239}
240
241/// Build a release-path CUDA launch envelope from probed device caps and a
242/// requested launch shape.
243///
244/// # Errors
245///
246/// Returns [`CudaKernelLaunchEnvelopeError`] when grid or block products
247/// overflow the release diagnostic fields.
248pub fn diagnose_cuda_kernel_launch_shape(
249    kernel: &'static str,
250    device: CudaKernelDeviceEnvelope,
251    shape: CudaKernelLaunchShape,
252    cooperative_resident_block_limit: Option<u64>,
253) -> Result<CudaKernelLaunchEnvelope, CudaKernelLaunchEnvelopeError> {
254    let grid_blocks = checked_dim_product_u64(shape.grid, "grid block count")?;
255    let threads_per_block = checked_dim_product_u32(shape.block, "threads per block")?;
256    let requirement = CudaKernelRequirement {
257        min_sm_major: 0,
258        min_sm_minor: 0,
259        requested_threads_per_block: threads_per_block,
260        requested_shared_memory_bytes: u64::from(shape.dynamic_shared_memory_bytes),
261        requires_cooperative_launch: shape.cooperative,
262        requires_tensor_cores: shape.requires_tensor_cores,
263    };
264    let diagnostic = diagnose_cuda_kernel_launch(kernel, device, requirement);
265    Ok(CudaKernelLaunchEnvelope {
266        kernel,
267        device,
268        requirement,
269        shape,
270        grid_blocks,
271        threads_per_block,
272        cooperative_resident_block_limit,
273        diagnostic,
274    })
275}
276
277/// Diagnose whether a CUDA kernel can launch using caller-owned scratch.
278pub fn diagnose_cuda_kernel_launch_with_scratch<'a>(
279    kernel: &'static str,
280    device: CudaKernelDeviceEnvelope,
281    requirement: CudaKernelRequirement,
282    scratch: &'a mut CudaKernelLaunchDiagnosticScratch,
283) -> CudaKernelLaunchDiagnosticRef<'a> {
284    record_cuda_kernel_launch_failures(device, requirement, &mut scratch.failures);
285
286    CudaKernelLaunchDiagnosticRef {
287        kernel,
288        failures: &scratch.failures,
289    }
290}
291
292fn record_cuda_kernel_launch_failures(
293    device: CudaKernelDeviceEnvelope,
294    requirement: CudaKernelRequirement,
295    failures: &mut Vec<CudaKernelCapabilityFailure>,
296) {
297    failures.clear();
298    if (device.sm_major, device.sm_minor) < (requirement.min_sm_major, requirement.min_sm_minor) {
299        failures.push(CudaKernelCapabilityFailure::SmVersion {
300            required_major: requirement.min_sm_major,
301            required_minor: requirement.min_sm_minor,
302            actual_major: device.sm_major,
303            actual_minor: device.sm_minor,
304        });
305    }
306    if requirement.requested_threads_per_block > device.max_threads_per_block {
307        failures.push(CudaKernelCapabilityFailure::ThreadsPerBlock {
308            requested: requirement.requested_threads_per_block,
309            maximum: device.max_threads_per_block,
310        });
311    }
312    if requirement.requested_shared_memory_bytes > device.shared_memory_per_block_bytes {
313        failures.push(CudaKernelCapabilityFailure::SharedMemory {
314            requested: requirement.requested_shared_memory_bytes,
315            maximum: device.shared_memory_per_block_bytes,
316        });
317    }
318    if requirement.requires_cooperative_launch && !device.supports_cooperative_launch {
319        failures.push(CudaKernelCapabilityFailure::CooperativeLaunch);
320    }
321    if requirement.requires_tensor_cores && !device.supports_tensor_cores {
322        failures.push(CudaKernelCapabilityFailure::TensorCores);
323    }
324}
325
326fn write_stable_message(
327    kernel: &'static str,
328    failures: &[CudaKernelCapabilityFailure],
329    out: &mut String,
330) {
331    use std::fmt::Write as _;
332
333    out.clear();
334    let _ = write!(out, "cuda-kernel-capability-v1|kernel={kernel}|status=");
335    if failures.is_empty() {
336        out.push_str("ok");
337        return;
338    }
339    out.push_str("blocked|fix=");
340    for (index, failure) in failures.iter().enumerate() {
341        if index > 0 {
342            out.push(',');
343        }
344        match failure {
345            CudaKernelCapabilityFailure::SmVersion {
346                required_major,
347                required_minor,
348                actual_major,
349                actual_minor,
350            } => {
351                let _ = write!(
352                    out,
353                    "sm_version(required={required_major}.{required_minor},actual={actual_major}.{actual_minor})"
354                );
355            }
356            CudaKernelCapabilityFailure::ThreadsPerBlock { requested, maximum } => {
357                let _ = write!(
358                    out,
359                    "threads_per_block(requested={requested},max={maximum})"
360                );
361            }
362            CudaKernelCapabilityFailure::SharedMemory { requested, maximum } => {
363                let _ = write!(out, "shared_memory(requested={requested},max={maximum})");
364            }
365            CudaKernelCapabilityFailure::CooperativeLaunch => out.push_str("cooperative_launch"),
366            CudaKernelCapabilityFailure::TensorCores => out.push_str("tensor_cores"),
367        }
368    }
369}
370
371fn push_launch_envelope_suffix(envelope: &CudaKernelLaunchEnvelope, out: &mut String) {
372    use std::fmt::Write as _;
373
374    let _ = write!(
375        out,
376        "|grid={:?}|block={:?}|grid_blocks={}|threads_per_block={}|dynamic_shared_bytes={}",
377        envelope.shape.grid,
378        envelope.shape.block,
379        envelope.grid_blocks,
380        envelope.threads_per_block,
381        envelope.shape.dynamic_shared_memory_bytes
382    );
383    if let Some(limit) = envelope.cooperative_resident_block_limit {
384        let _ = write!(out, "|cooperative_resident_block_limit={limit}");
385        if envelope.grid_blocks > limit {
386            let _ = write!(
387                out,
388                "|cooperative_residency=blocked(required={},limit={})",
389                envelope.grid_blocks, limit
390            );
391        }
392    }
393}
394
395fn checked_dim_product_u64(
396    dims: [u32; 3],
397    label: &'static str,
398) -> Result<u64, CudaKernelLaunchEnvelopeError> {
399    CUDA_NUMERIC.checked_dim_product_u64(dims).ok_or_else(|| {
400        CudaKernelLaunchEnvelopeError {
401            fix: format!(
402                "CUDA launch envelope {label} overflowed u64 for dimensions {dims:?}. Fix: shard the launch before release diagnostics."
403            ),
404        }
405    })
406}
407
408fn checked_dim_product_u32(
409    dims: [u32; 3],
410    label: &'static str,
411) -> Result<u32, CudaKernelLaunchEnvelopeError> {
412    CUDA_NUMERIC.checked_dim_product_u32(dims).ok_or_else(|| {
413        let product = checked_dim_product_u64(dims, label).map_or_else(
414            |_| "overflowed u64".to_string(),
415            |value| value.to_string(),
416        );
417        CudaKernelLaunchEnvelopeError {
418            fix: format!(
419                "CUDA launch envelope {label} value {product} cannot fit u32. Fix: lower block dimensions before launch."
420            ),
421        }
422    })
423}
424
#[cfg(test)]
mod tests {
    use super::*;

    /// Device that satisfies every requirement used in these tests.
    fn device() -> CudaKernelDeviceEnvelope {
        CudaKernelDeviceEnvelope {
            sm_major: 12,
            sm_minor: 0,
            max_threads_per_block: 1_024,
            shared_memory_per_block_bytes: 99_840,
            supports_cooperative_launch: true,
            supports_tensor_cores: true,
        }
    }

    /// Device that misses every field of `demanding_requirement`.
    fn underpowered_device() -> CudaKernelDeviceEnvelope {
        CudaKernelDeviceEnvelope {
            sm_major: 8,
            sm_minor: 6,
            max_threads_per_block: 512,
            shared_memory_per_block_bytes: 16_384,
            supports_cooperative_launch: false,
            supports_tensor_cores: false,
        }
    }

    /// Requirement that `device()` satisfies.
    fn satisfiable_requirement() -> CudaKernelRequirement {
        CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 256,
            requested_shared_memory_bytes: 32_768,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        }
    }

    /// Requirement that `underpowered_device()` fails on all five checks.
    fn demanding_requirement() -> CudaKernelRequirement {
        CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 1_024,
            requested_shared_memory_bytes: 65_536,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        }
    }

    #[test]
    fn diagnostic_accepts_satisfied_kernel_requirements() {
        let diagnostic =
            diagnose_cuda_kernel_launch("frontier", device(), satisfiable_requirement());

        assert!(diagnostic.is_launchable());
        assert_eq!(
            diagnostic.stable_message(),
            "cuda-kernel-capability-v1|kernel=frontier|status=ok"
        );
    }

    #[test]
    fn diagnostic_reports_every_missing_requirement() {
        let diagnostic = diagnose_cuda_kernel_launch(
            "frontier",
            underpowered_device(),
            demanding_requirement(),
        );

        assert!(!diagnostic.is_launchable());
        assert_eq!(diagnostic.failures.len(), 5);
        let message = diagnostic.stable_message();
        assert!(message.contains("sm_version(required=9.0,actual=8.6)"));
        assert!(message.contains("threads_per_block(requested=1024,max=512)"));
        assert!(message.contains("shared_memory(requested=65536,max=16384)"));
        assert!(message.contains("cooperative_launch"));
        assert!(message.contains("tensor_cores"));
    }

    #[test]
    fn launch_envelope_records_shape_residency_and_stable_message() {
        let shape = CudaKernelLaunchShape {
            grid: [9, 2, 1],
            block: [128, 2, 1],
            dynamic_shared_memory_bytes: 32_768,
            cooperative: true,
            requires_tensor_cores: true,
        };
        let envelope = diagnose_cuda_kernel_launch_shape("frontier", device(), shape, Some(16))
            .expect("Fix: valid CUDA launch envelope should derive");

        assert_eq!(envelope.grid_blocks, 18);
        assert_eq!(envelope.threads_per_block, 256);
        assert!(!envelope.is_launchable());
        let message = envelope.stable_message();
        assert!(message.contains("cuda-kernel-capability-v1|kernel=frontier"));
        assert!(message.contains("grid_blocks=18"));
        assert!(message.contains("threads_per_block=256"));
        assert!(message.contains("cooperative_residency=blocked(required=18,limit=16)"));
    }

    #[test]
    fn launch_envelope_rejects_thread_block_product_overflow() {
        let oversized = CudaKernelLaunchShape {
            grid: [1, 1, 1],
            block: [u32::MAX, u32::MAX, 2],
            dynamic_shared_memory_bytes: 0,
            cooperative: false,
            requires_tensor_cores: false,
        };
        let error = diagnose_cuda_kernel_launch_shape("frontier", device(), oversized, None)
            .expect_err("oversized CUDA block shape must fail before diagnostics");

        assert!(error.fix.contains("threads per block"));
    }

    #[test]
    fn diagnostic_scratch_reuses_failure_and_message_storage() {
        let mut scratch = CudaKernelLaunchDiagnosticScratch::default();

        // First pass: a blocked launch fills the failure storage.
        let blocked = diagnose_cuda_kernel_launch_with_scratch(
            "frontier",
            underpowered_device(),
            demanding_requirement(),
            &mut scratch,
        );
        assert!(!blocked.is_launchable());
        assert_eq!(blocked.failures.len(), 5);
        let failures_ptr = blocked.failures.as_ptr();

        // Second pass: the blocked message fills the message storage.
        let message_ptr = {
            let message = scratch.diagnose_stable_message(
                "frontier",
                underpowered_device(),
                demanding_requirement(),
            );
            assert!(message.contains("status=blocked"));
            message.as_ptr()
        };

        // Third pass: a launchable diagnosis must land in the same failure
        // storage.
        let launchable = diagnose_cuda_kernel_launch_with_scratch(
            "frontier",
            device(),
            satisfiable_requirement(),
            &mut scratch,
        );
        assert!(launchable.is_launchable());
        let launchable_failures_ptr = launchable.failures.as_ptr();
        assert_eq!(launchable_failures_ptr, failures_ptr);

        // Fourth pass: the ok message must land in the same message storage.
        let message =
            scratch.diagnose_stable_message("frontier", device(), satisfiable_requirement());

        assert_eq!(
            message,
            "cuda-kernel-capability-v1|kernel=frontier|status=ok"
        );
        assert_eq!(
            message.as_ptr(),
            message_ptr,
            "Fix: repeated CUDA launch diagnostics must reuse caller-owned message storage instead of allocating one string per failure and joining them."
        );
    }
}
633
#[cfg(test)]
mod owned_diagnostic_allocation_tests {
    use super::*;

    #[test]
    fn owned_diagnostic_moves_failures_out_of_scratch_without_clone() {
        let weak_device = CudaKernelDeviceEnvelope {
            sm_major: 8,
            sm_minor: 9,
            max_threads_per_block: 512,
            shared_memory_per_block_bytes: 32_768,
            supports_cooperative_launch: false,
            supports_tensor_cores: false,
        };
        let requirement = CudaKernelRequirement {
            min_sm_major: 9,
            min_sm_minor: 0,
            requested_threads_per_block: 1_024,
            requested_shared_memory_bytes: 65_536,
            requires_cooperative_launch: true,
            requires_tensor_cores: true,
        };
        let diagnostic = diagnose_cuda_kernel_launch("frontier", weak_device, requirement);

        assert_eq!(diagnostic.failures.len(), 5);

        // Source-level guard: inspect only the part of this file that comes
        // before the first test-only attribute.
        let source = include_str!("kernel_failure_diagnostics.rs");
        let production = source
            .split("#[cfg(test)]")
            .next()
            .expect("Fix: CUDA diagnostic production source must be present before tests");
        assert!(
            !production.contains(".to_vec()"),
            "Fix: owned CUDA launch diagnostics must move the scratch failure vector instead of cloning it."
        );

        let routes_through_policy = production.contains("use crate::numeric::CUDA_NUMERIC;")
            && production.contains("CUDA_NUMERIC.checked_dim_product_u64(dims)")
            && production.contains("CUDA_NUMERIC.checked_dim_product_u32(dims)");
        // The needle is assembled with `concat!` so this test's own source
        // does not contain the forbidden path as one literal.
        let bypasses_policy = production.contains(concat!(
            "vyre_driver::numeric::",
            "checked_dim_product"
        ));
        assert!(
            routes_through_policy && !bypasses_policy,
            "Fix: CUDA launch-envelope dimension products must route through the shared CUDA numeric policy."
        );
    }
}
684