Skip to main content

vyre_driver/backend/
vyre_backend.rs

1//! The frozen `VyreBackend` contract.
2
3use std::sync::Arc;
4
5use smallvec::SmallVec;
6use vyre_foundation::ir::Program;
7
8use crate::backend::{
9    device_buffer::unsupported_device_buffer, private, BackendError, CompiledPipeline,
10    DeviceBuffer, DispatchConfig, OutputBuffers, PendingDispatch, Resource, TimedDispatchResult,
11};
12
13/// One backend-resident program dispatch in an ordered sequence.
14pub struct ResidentDispatchStep<'a> {
15    /// Program to dispatch.
16    pub program: &'a Program,
17    /// Resident resources in binding order.
18    pub resources: &'a [Resource],
19    /// Optional CUDA/grid-style launch override.
20    pub grid_override: Option<[u32; 3]>,
21}
22
23/// One compact byte range to read from a backend-resident resource.
24pub struct ResidentReadRange<'a> {
25    /// Resident resource to read.
26    pub resource: &'a Resource,
27    /// Byte offset inside the resident resource.
28    pub byte_offset: usize,
29    /// Number of bytes to read.
30    pub byte_len: usize,
31}
32
33/// Timing captured for an ordered resident dispatch sequence.
34#[derive(Clone, Debug, Default, Eq, PartialEq)]
35pub struct ResidentSequenceTiming {
36    /// Host-observed sequence duration including requested readbacks.
37    pub wall_ns: u64,
38    /// Device-observed elapsed dispatch time when the backend exposes timers.
39    pub device_ns: Option<u64>,
40    /// Host time spent enqueueing backend work before waiting.
41    pub enqueue_ns: Option<u64>,
42    /// Host time spent waiting for completion and collecting outputs.
43    pub wait_ns: Option<u64>,
44}
45
46/// The frozen contract between vyre and every execution backend.
47///
48/// A backend is a pure function from a validated `Program` and input buffers
49/// to output buffers. Implementations must be `Send + Sync`, deterministic
50/// for identical inputs, and byte-identical to the CPU reference on success.
51/// This trait is the keystone of the vyre abstraction thesis: frontends do
52/// not know which backend runs their IR, and backends do not know which
53/// frontend produced it.
54///
55/// # Examples
56///
57pub trait VyreBackend: private::Sealed + Send + Sync {
58    /// Stable backend identifier used for logging, certificates, and adapter selection.
59    ///
60    /// The identifier must be unique among all backends linked into the
61    /// current process. Conformance reports include this string so that
62    /// consumers know exactly which implementation was certified.
63    fn id(&self) -> &'static str;
64
65    /// Backend implementation version string used for certificates and
66    /// regression tracking.
67    ///
68    /// The default returns `"unspecified"`. Concrete backends should
69    /// override this with their crate version (e.g. `"0.4.0"`) so that
70    /// certificates can detect backend upgrades that may require re-cert.
71    fn version(&self) -> &'static str {
72        "unspecified"
73    }
74
75    /// Operation ids this backend can execute without further lowering.
76    fn supported_ops(&self) -> &std::collections::HashSet<vyre_foundation::ir::OpId> {
77        use crate::backend::validation::default_supported_ops;
78        default_supported_ops()
79    }
80
81    // Raw backend shader text is a concrete-driver implementation
82    // detail, not part of the substrate-neutral `VyreBackend`
83    // contract.
84
85    /// Executes the program with the given input buffers and returns the output buffers.
86    ///
87    /// On success the returned bytes must match the pure-Rust reference
88    /// implementation bit-for-bit. On failure the backend must return a
89    /// [`BackendError`] whose message contains an actionable `Fix: ` hint.
90    ///
91    /// # Examples
92    ///
93    /// ```no_run
94    /// use vyre::{Program, VyreBackend, DispatchConfig};
95    ///
96    /// # fn example(backend: &dyn VyreBackend, program: &Program) -> Result<Vec<Vec<u8>>, vyre::BackendError> {
97    /// let inputs = vec![vec![1u8, 2, 3]];
98    /// let config = DispatchConfig::default();
99    /// backend.dispatch(program, &inputs, &config)
100    /// # }
101    /// ```
102    ///
103    /// # Errors
104    ///
105    /// Returns [`BackendError`] when the backend cannot complete dispatch.
106    /// The error message always includes a `Fix: ` remediation section.
107    fn dispatch(
108        &self,
109        program: &Program,
110        inputs: &[Vec<u8>],
111        config: &DispatchConfig,
112    ) -> Result<Vec<Vec<u8>>, BackendError>;
113
114    /// Executes the program with borrowed input buffers.
115    ///
116    /// Backends may override this method to avoid staging borrowed bytes into
117    /// owned `Vec<u8>` buffers. The default is non-breaking: it performs one
118    /// owned vector allocation for the call and delegates to
119    /// [`VyreBackend::dispatch`].
120    ///
121    /// # Errors
122    ///
123    /// Returns [`BackendError`] when the backend cannot complete dispatch.
124    fn dispatch_borrowed(
125        &self,
126        program: &Program,
127        inputs: &[&[u8]],
128        config: &DispatchConfig,
129    ) -> Result<Vec<Vec<u8>>, BackendError> {
130        let owned =
131            crate::backend::clone_borrowed_inputs_for_dispatch(inputs, "backend input staging")?;
132        let outputs = self.dispatch(program, &owned, config)?;
133        crate::observability::record_dispatch_io(inputs, &outputs);
134        Ok(outputs)
135    }
136
137    /// Executes a borrowed-input dispatch and returns backend-owned timing.
138    ///
139    /// The default records host wall time and delegates to
140    /// [`VyreBackend::dispatch_borrowed`]. Device-specific backends override
141    /// this only inside their driver crates so benchmark crates never import
142    /// vendor APIs directly.
143    ///
144    /// # Errors
145    ///
146    /// Returns [`BackendError`] when the backend cannot complete dispatch.
147    fn dispatch_borrowed_timed(
148        &self,
149        program: &Program,
150        inputs: &[&[u8]],
151        config: &DispatchConfig,
152    ) -> Result<TimedDispatchResult, BackendError> {
153        let started = std::time::Instant::now();
154        let outputs = self.dispatch_borrowed(program, inputs, config)?;
155        Ok(TimedDispatchResult {
156            outputs,
157            wall_ns: crate::backend::checked_elapsed_wall_ns(started, "backend borrowed dispatch")?,
158            device_ns: None,
159            enqueue_ns: None,
160            wait_ns: None,
161        })
162    }
163
164    /// Executes the program with borrowed input buffers and writes outputs into
165    /// caller-owned storage.
166    ///
167    /// Backends may override this method to reuse output buffers across
168    /// dispatches. The default preserves the existing dispatch contract and
169    /// copies returned bytes into existing output slots where possible.
170    ///
171    /// # Errors
172    ///
173    /// Returns [`BackendError`] when the backend cannot complete dispatch.
174    fn dispatch_borrowed_into(
175        &self,
176        program: &Program,
177        inputs: &[&[u8]],
178        config: &DispatchConfig,
179        outputs: &mut OutputBuffers,
180    ) -> Result<(), BackendError> {
181        let result = self.dispatch_borrowed(program, inputs, config)?;
182        let stats = crate::backend::dispatch_result::replace_output_buffers_preserving_slots_with_memory_stats(
183            result,
184            outputs,
185        );
186        crate::observability::record_output_replacement_stats(stats);
187        Ok(())
188    }
189
190    /// Allocate a backend-resident buffer and return a stable resource handle.
191    ///
192    /// Backends that support resident resources override this method so callers
193    /// can keep hot inputs on the device without importing a concrete driver
194    /// crate. The returned [`Resource`] is only meaningful to the backend that
195    /// produced it.
196    ///
197    /// # Errors
198    ///
199    /// Returns [`BackendError`] when the backend cannot allocate a resident
200    /// resource of the requested size.
201    fn allocate_resident(&self, _byte_len: usize) -> Result<Resource, BackendError> {
202        Err(BackendError::UnsupportedFeature {
203            name: "resident buffer allocation".to_string(),
204            backend: self.id().to_string(),
205        })
206    }
207
208    /// Upload bytes into a backend-resident resource.
209    ///
210    /// # Errors
211    ///
212    /// Returns [`BackendError`] when the resource is not owned by this backend
213    /// or the byte length does not match the resident allocation.
214    fn upload_resident(&self, _resource: &Resource, _bytes: &[u8]) -> Result<(), BackendError> {
215        Err(BackendError::UnsupportedFeature {
216            name: "resident buffer upload".to_string(),
217            backend: self.id().to_string(),
218        })
219    }
220
221    /// Upload several backend-resident resources as one logical staging
222    /// operation.
223    ///
224    /// Backends that support resident graph/dataflow hot paths should override
225    /// this with a native batched transfer. The default fails loudly instead
226    /// of looping over [`VyreBackend::upload_resident`], because a hidden
227    /// per-buffer synchronization loop destroys the performance contract.
228    ///
229    /// # Errors
230    ///
231    /// Returns [`BackendError`] when the backend cannot batch resident uploads
232    /// or when any resource/byte length is invalid.
233    fn upload_resident_many(&self, _uploads: &[(&Resource, &[u8])]) -> Result<(), BackendError> {
234        Err(BackendError::UnsupportedFeature {
235            name: "resident buffer batch upload".to_string(),
236            backend: self.id().to_string(),
237        })
238    }
239
240    /// Upload bytes into a subrange of a backend-resident resource.
241    ///
242    /// This is the hot-loop path for reusable resident slots whose capacity is
243    /// larger than the current logical payload. Backends must not require the
244    /// upload length to equal the allocation length.
245    ///
246    /// # Errors
247    ///
248    /// Returns [`BackendError`] when ranged upload is unsupported, the resource
249    /// is not owned by this backend, or the destination range is out of bounds.
250    fn upload_resident_at(
251        &self,
252        _resource: &Resource,
253        _dst_offset_bytes: usize,
254        _bytes: &[u8],
255    ) -> Result<(), BackendError> {
256        Err(BackendError::UnsupportedFeature {
257            name: "resident buffer ranged upload".to_string(),
258            backend: self.id().to_string(),
259        })
260    }
261
262    /// Upload several resident subranges as one logical staging operation.
263    ///
264    /// The default fails loudly instead of looping over
265    /// [`VyreBackend::upload_resident_at`], because hidden per-range
266    /// synchronization breaks resident hot-loop performance.
267    ///
268    /// # Errors
269    ///
270    /// Returns [`BackendError`] when ranged batch upload is unsupported or when
271    /// any resource/range is invalid.
272    fn upload_resident_at_many(
273        &self,
274        _uploads: &[(&Resource, usize, &[u8])],
275    ) -> Result<(), BackendError> {
276        Err(BackendError::UnsupportedFeature {
277            name: "resident buffer ranged batch upload".to_string(),
278            backend: self.id().to_string(),
279        })
280    }
281
282    /// Download a backend-resident resource into a new host buffer.
283    ///
284    /// Prefer [`VyreBackend::download_resident_into`] in hot loops so repeated
285    /// validation does not allocate a fresh `Vec` for every readback.
286    ///
287    /// # Errors
288    ///
289    /// Returns [`BackendError`] when the backend cannot download resident
290    /// resources or when `resource` is not owned by this backend.
291    fn download_resident(&self, resource: &Resource) -> Result<Vec<u8>, BackendError> {
292        let mut bytes = Vec::new();
293        self.download_resident_into(resource, &mut bytes)?;
294        Ok(bytes)
295    }
296
297    /// Download a backend-resident resource into caller-owned storage.
298    ///
299    /// Implementations must clear and reuse `out`; hidden compatibility
300    /// allocation defeats resident hot-loop validation.
301    ///
302    /// # Errors
303    ///
304    /// Returns [`BackendError`] when resident download is unsupported or when
305    /// `resource` is not owned by this backend.
306    fn download_resident_into(
307        &self,
308        _resource: &Resource,
309        _out: &mut Vec<u8>,
310    ) -> Result<(), BackendError> {
311        Err(BackendError::UnsupportedFeature {
312            name: "resident buffer download".to_string(),
313            backend: self.id().to_string(),
314        })
315    }
316
317    /// Download a byte range from a backend-resident resource into a new host
318    /// buffer.
319    ///
320    /// Prefer [`VyreBackend::download_resident_range_into`] in hot loops.
321    ///
322    /// # Errors
323    ///
324    /// Returns [`BackendError`] when resident ranged download is unsupported,
325    /// the range is invalid, or `resource` is not owned by this backend.
326    fn download_resident_range(
327        &self,
328        resource: &Resource,
329        byte_offset: usize,
330        byte_len: usize,
331    ) -> Result<Vec<u8>, BackendError> {
332        let mut bytes = Vec::new();
333        bytes.try_reserve_exact(byte_len).map_err(|error| {
334            BackendError::InvalidProgram {
335                fix: format!(
336                    "Fix: resident ranged download could not reserve {byte_len} output byte(s): {error}. Split the readback range before dispatch."
337                ),
338            }
339        })?;
340        self.download_resident_range_into(resource, byte_offset, byte_len, &mut bytes)?;
341        Ok(bytes)
342    }
343
344    /// Download a byte range from a backend-resident resource into
345    /// caller-owned storage.
346    ///
347    /// # Errors
348    ///
349    /// Returns [`BackendError`] when resident ranged download is unsupported,
350    /// the range is invalid, or `resource` is not owned by this backend.
351    fn download_resident_range_into(
352        &self,
353        _resource: &Resource,
354        _byte_offset: usize,
355        _byte_len: usize,
356        _out: &mut Vec<u8>,
357    ) -> Result<(), BackendError> {
358        Err(BackendError::UnsupportedFeature {
359            name: "resident buffer ranged download".to_string(),
360            backend: self.id().to_string(),
361        })
362    }
363
364    /// Download several byte ranges from backend-resident resources into
365    /// caller-owned storage as one logical readback operation.
366    ///
367    /// Backends with a real multi-readback path should override this to issue
368    /// all copies behind one backend synchronization boundary.
369    ///
370    /// # Errors
371    ///
372    /// Returns [`BackendError`] when counts do not match, any range is invalid,
373    /// or resident ranged download is unsupported.
374    fn download_resident_ranges_into(
375        &self,
376        ranges: &[(&Resource, usize, usize)],
377        outputs: &mut [&mut Vec<u8>],
378    ) -> Result<(), BackendError> {
379        if ranges.len() != outputs.len() {
380            return Err(BackendError::InvalidProgram {
381                fix: format!(
382                    "Fix: resident ranged batch download expected matching range/output counts but got {} range(s) and {} output(s).",
383                    ranges.len(),
384                    outputs.len()
385                ),
386            });
387        }
388        for ((resource, byte_offset, byte_len), output) in ranges.iter().zip(outputs.iter_mut()) {
389            self.download_resident_range_into(resource, *byte_offset, *byte_len, output)?;
390        }
391        Ok(())
392    }
393
394    /// Free a backend-resident resource previously returned by
395    /// [`VyreBackend::allocate_resident`].
396    ///
397    /// # Errors
398    ///
399    /// Returns [`BackendError`] when the resource is unknown or still in use.
400    fn free_resident(&self, _resource: Resource) -> Result<(), BackendError> {
401        Err(BackendError::UnsupportedFeature {
402            name: "resident buffer free".to_string(),
403            backend: self.id().to_string(),
404        })
405    }
406
407    /// Dispatch using backend-resident resources and return backend-owned
408    /// timing.
409    ///
410    /// # Errors
411    ///
412    /// Returns [`BackendError`] when the backend does not support resident
413    /// dispatch or any resource is invalid for the program.
414    fn dispatch_resident_timed(
415        &self,
416        _program: &Program,
417        _resources: &[Resource],
418        _config: &DispatchConfig,
419    ) -> Result<TimedDispatchResult, BackendError> {
420        Err(BackendError::UnsupportedFeature {
421            name: "resident timed dispatch".to_string(),
422            backend: self.id().to_string(),
423        })
424    }
425
426    /// Dispatch an ordered sequence of resident-buffer programs and read
427    /// selected resident byte ranges into caller-owned storage.
428    ///
429    /// The default preserves correctness by dispatching each step through
430    /// [`VyreBackend::dispatch_resident_timed`] and then calling
431    /// [`VyreBackend::download_resident_ranges_into`]. CUDA overrides this to
432    /// enqueue the whole dependent chain plus D2H readbacks on one stream and
433    /// pay one host synchronization point.
434    ///
435    /// # Errors
436    ///
437    /// Returns [`BackendError`] when any step fails, when readback ranges are
438    /// invalid, or when the backend cannot perform resident dispatch/readback.
439    fn dispatch_resident_sequence_read_ranges_into(
440        &self,
441        steps: &[ResidentDispatchStep<'_>],
442        read_ranges: &[ResidentReadRange<'_>],
443        outputs: &mut [&mut Vec<u8>],
444    ) -> Result<(), BackendError> {
445        for step in steps {
446            let mut config = DispatchConfig::default();
447            config.grid_override = step.grid_override;
448            self.dispatch_resident_timed(step.program, step.resources, &config)?;
449        }
450        let ranges = read_ranges
451            .iter()
452            .map(|range| (range.resource, range.byte_offset, range.byte_len))
453            .collect::<SmallVec<[_; 8]>>();
454        self.download_resident_ranges_into(&ranges, outputs)
455    }
456
457    /// Timed variant of
458    /// [`VyreBackend::dispatch_resident_sequence_read_ranges_into`].
459    ///
460    /// The default preserves correctness by summing each step's resident
461    /// dispatch timing and then downloading the requested ranges. Backends with
462    /// a fused resident sequence path should override this to keep their
463    /// optimized stream/queue behavior while exposing device timing.
464    ///
465    /// # Errors
466    ///
467    /// Returns [`BackendError`] when any step fails, when readback ranges are
468    /// invalid, or when resident sequence timing overflows.
469    fn dispatch_resident_sequence_read_ranges_timed_into(
470        &self,
471        steps: &[ResidentDispatchStep<'_>],
472        read_ranges: &[ResidentReadRange<'_>],
473        outputs: &mut [&mut Vec<u8>],
474    ) -> Result<ResidentSequenceTiming, BackendError> {
475        let started = std::time::Instant::now();
476        let mut device_ns = Some(0_u64);
477        let mut enqueue_ns = Some(0_u64);
478        let mut wait_ns = Some(0_u64);
479        for step in steps {
480            let mut config = DispatchConfig::default();
481            config.grid_override = step.grid_override;
482            let timed = self.dispatch_resident_timed(step.program, step.resources, &config)?;
483            device_ns = sum_optional_timing(device_ns, timed.device_ns, "device timing")?;
484            enqueue_ns = sum_optional_timing(enqueue_ns, timed.enqueue_ns, "enqueue timing")?;
485            wait_ns = sum_optional_timing(wait_ns, timed.wait_ns, "wait timing")?;
486        }
487        let ranges = read_ranges
488            .iter()
489            .map(|range| (range.resource, range.byte_offset, range.byte_len))
490            .collect::<SmallVec<[_; 8]>>();
491        self.download_resident_ranges_into(&ranges, outputs)?;
492        Ok(ResidentSequenceTiming {
493            wall_ns: elapsed_resident_sequence_wall_ns(started)?,
494            device_ns,
495            enqueue_ns,
496            wait_ns,
497        })
498    }
499
500    /// Dispatch a resident prefix, repeat a resident sub-sequence, and read
501    /// selected resident byte ranges into caller-owned storage.
502    ///
503    /// This is the fixed-point hot-path contract: dataflow clients can express
504    /// a repeated kernel group without allocating one host sequence entry per
505    /// iteration. Backends that understand repetition should override this to
506    /// keep launch preparation and parameter upload sublinear in
507    /// `repeat_count`.
508    ///
509    /// # Errors
510    ///
511    /// Returns [`BackendError`] when any step fails, when readback ranges are
512    /// invalid, or when the backend cannot perform resident dispatch/readback.
513    fn dispatch_resident_repeated_sequence_read_ranges_into(
514        &self,
515        prefix_steps: &[ResidentDispatchStep<'_>],
516        repeated_steps: &[ResidentDispatchStep<'_>],
517        repeat_count: u32,
518        read_ranges: &[ResidentReadRange<'_>],
519        outputs: &mut [&mut Vec<u8>],
520    ) -> Result<(), BackendError> {
521        for step in prefix_steps {
522            let mut config = DispatchConfig::default();
523            config.grid_override = step.grid_override;
524            self.dispatch_resident_timed(step.program, step.resources, &config)?;
525        }
526        for _ in 0..repeat_count {
527            for step in repeated_steps {
528                let mut config = DispatchConfig::default();
529                config.grid_override = step.grid_override;
530                self.dispatch_resident_timed(step.program, step.resources, &config)?;
531            }
532        }
533        let ranges = read_ranges
534            .iter()
535            .map(|range| (range.resource, range.byte_offset, range.byte_len))
536            .collect::<SmallVec<[_; 8]>>();
537        self.download_resident_ranges_into(&ranges, outputs)
538    }
539
540    /// Optional pre-compilation hook for the pipeline-mode API.
541    ///
542    /// Default returns `Ok(None)`  -  the framework wraps in a passthrough
543    /// pipeline whose `dispatch` calls back into [`VyreBackend::dispatch`]
544    /// every time. Backends that genuinely cache compiled state (compute
545    /// pipeline, bind-group layout, lowered shader text) override this and
546    /// return `Ok(Some(...))` so repeated dispatches skip the compilation
547    /// overhead.
548    ///
549    /// The returned pipeline MUST be bit-identical to repeated
550    /// `dispatch(program, inputs, config)` for the program it was compiled
551    /// from. The cache key is the backend's responsibility  -  the framework
552    /// does not deduplicate compile calls.
553    ///
554    /// Implementing this method is the P-6 contract from
555    /// `docs/audits/ROADMAP_PERFORMANCE.md`: "compile target-text + pipeline +
556    /// bind-group-layout once; dispatch repeatedly with different inputs."
557    ///
558    /// # Errors
559    ///
560    /// Returns [`BackendError`] when the backend cannot complete the
561    /// pre-compilation. Callers should treat this as fatal for the program
562    /// (the program will not dispatch successfully via any path).
563    fn compile_native(
564        &self,
565        _program: &Program,
566        _config: &DispatchConfig,
567    ) -> Result<Option<Arc<dyn CompiledPipeline>>, BackendError> {
568        Ok(None)
569    }
570
571    /// Optional pre-compilation hook for callers that already own a shared
572    /// program allocation.
573    ///
574    /// Backends that store the program inside the compiled pipeline should
575    /// override this method and keep the supplied [`Arc<Program>`] instead of
576    /// cloning the IR. The default preserves the older borrowed-program hook
577    /// for backends that only inspect the program while compiling.
578    ///
579    /// # Errors
580    ///
581    /// Returns [`BackendError`] when backend-native compilation fails.
582    fn compile_native_shared(
583        &self,
584        program: Arc<Program>,
585        config: &DispatchConfig,
586    ) -> Result<Option<Arc<dyn CompiledPipeline>>, BackendError> {
587        self.compile_native(&program, config)
588    }
589
590    /// Optional compiled-pipeline cache counters for compile telemetry.
591    ///
592    /// Return `None` unless the backend can report real cache hits and misses.
593    fn pipeline_cache_snapshot(&self) -> Option<crate::pipeline::PipelineCacheSnapshot> {
594        None
595    }
596
597    /// Optional backend-specific numeric telemetry for release evidence.
598    ///
599    /// Return an empty vector unless the backend can report real counters.
600    /// Metric names must be stable ASCII identifiers suitable for JSON and
601    /// Prometheus-style export.
602    fn backend_metric_snapshot(&self) -> Vec<(&'static str, u64)> {
603        Vec::new()
604    }
605
606    /// Non-blocking dispatch primitive.
607    ///
608    /// Returns a [`PendingDispatch`] handle immediately; the caller
609    /// polls via [`PendingDispatch::is_ready`] and consumes the result
610    /// via [`PendingDispatch::await_result`]. Backends that genuinely
611    /// pipeline dispatches override this so N concurrent dispatches
612    /// do not serialize on the host.
613    ///
614    /// Default: run the synchronous [`VyreBackend::dispatch`] path and
615    /// wrap the result in a trivially-ready handle. This keeps every
616    /// backend useful from the async API without forcing an async
617    /// rewrite.
618    ///
619    /// # Errors
620    ///
621    /// Returns [`BackendError`] if the dispatch cannot start. Errors
622    /// that surface only during GPU execution come back through
623    /// [`PendingDispatch::await_result`], not from this call.
624    fn dispatch_async(
625        &self,
626        program: &Program,
627        inputs: &[Vec<u8>],
628        config: &DispatchConfig,
629    ) -> Result<Box<dyn PendingDispatch>, BackendError> {
630        let outputs = self.dispatch(program, inputs, config)?;
631        Ok(Box::new(crate::backend::pending_dispatch::ReadyPending {
632            outputs,
633        }))
634    }
635
636    /// Non-blocking dispatch with borrowed input buffers.
637    ///
638    /// Backends that record GPU commands synchronously before returning can
639    /// override this to avoid cloning input buffers just to create a pending
640    /// handle. The returned [`PendingDispatch`] must not borrow from `inputs`.
641    ///
642    /// # Errors
643    ///
644    /// Returns [`BackendError`] if the dispatch cannot start.
645    fn dispatch_borrowed_async(
646        &self,
647        program: &Program,
648        inputs: &[&[u8]],
649        config: &DispatchConfig,
650    ) -> Result<Box<dyn PendingDispatch>, BackendError> {
651        let outputs = self.dispatch_borrowed(program, inputs, config)?;
652        Ok(Box::new(crate::backend::pending_dispatch::ReadyPending {
653            outputs,
654        }))
655    }
656
657    // ---------------------------------------------------------------
658    // Capability queries (all default to conservative "no" / minimal).
659    //
660    // These are the stable capability surface. Additional backends implement
661    // this trait by default-inheriting every capability below and OVERRIDING
662    // only the ones where they are more capable than the conservative floor.
663    // This means adding a backend is strictly additive  -  no existing
664    // backend impl has to change when a new capability query is added.
665    //
666    // Backends MUST report HONESTLY. Returning `true` from a capability
667    // query is a promise the lowering path emits the corresponding
668    // intrinsic and the adapter supports it. "Supported but broken" is a
669    // LAW 9 evasion (see AGENTS.md). If the feature bit is set on the
670    // device but the lowering emits a slower emulation sequence, the
671    // answer is `false` until the native lowering catches up.
672    // ---------------------------------------------------------------
673
674    /// Whether this backend's lowering path emits subgroup / wave
675    /// intrinsics AND the current adapter exposes them.
676    ///
677    /// Default: `false` (conservative  -  assumes no native subgroup lowering).
678    #[must_use]
679    fn supports_subgroup_ops(&self) -> bool {
680        false
681    }
682
683    /// Whether this backend lowers IEEE 754 binary16 (`DataType::F16`)
684    /// natively rather than emulating through `f32`.
685    ///
686    /// Default: `false`.
687    #[must_use]
688    fn supports_f16(&self) -> bool {
689        false
690    }
691
692    /// Whether this backend lowers bfloat16 (`DataType::BF16`) natively.
693    ///
694    /// Default: `false`.
695    #[must_use]
696    fn supports_bf16(&self) -> bool {
697        false
698    }
699
700    /// Whether this backend emits tensor-core / matrix-engine intrinsics
701    /// for supported tensor shapes.
702    ///
703    /// Default: `false`.
704    #[must_use]
705    fn supports_tensor_cores(&self) -> bool {
706        false
707    }
708
709    /// Whether this backend overlaps copies and compute via independent
710    /// queues or async engines.
711    ///
712    /// Default: `false` (host serializes copy ↔ compute).
713    #[must_use]
714    fn supports_async_compute(&self) -> bool {
715        false
716    }
717
718    /// Whether this backend supports indirect dispatch
719    /// (`Node::IndirectDispatch`).
720    ///
721    /// Default: `false`.
722    #[must_use]
723    fn supports_indirect_dispatch(&self) -> bool {
724        false
725    }
726
727    /// Whether this backend supports speculative dispatch  -  a fused
728    /// prefilter + confirmer kernel with commit-gated output and a
729    /// counter tail read back by the host.
730    ///
731    /// Default: `false`.
732    #[must_use]
733    fn supports_speculation(&self) -> bool {
734        false
735    }
736
737    /// Whether this backend supports device-side persistent-thread
738    /// dispatch (a long-running kernel that polls a work queue).
739    ///
740    /// Default: `false`.
741    #[must_use]
742    fn supports_persistent_thread_dispatch(&self) -> bool {
743        false
744    }
745
746    /// Whether this backend can satisfy `Node::Barrier { ordering:
747    /// MemoryOrdering::GridSync }` inside a single dispatch  -  i.e.
748    /// every thread in the entire grid waits at the barrier and
749    /// every prior write is globally visible afterwards. Backends
750    /// that lack a native grid barrier (workgroup-only fences) must
751    /// return `false`; registration-based dispatch may lower a
752    /// `GridSync` barrier to a host-orchestrated kernel split only
753    /// when [`VyreBackend::allows_host_grid_sync_split`] also returns
754    /// `true`.
755    ///
756    /// Backends with cooperative whole-grid launch support can return
757    /// `true`; backends limited to workgroup-local synchronization return
758    /// `false` until the target exposes a compatible grid-barrier primitive.
759    ///
760    /// Default: `false`.
761    #[must_use]
762    fn supports_grid_sync(&self) -> bool {
763        false
764    }
765
766    /// Whether a native cooperative grid-sync launch of `program` with these
767    /// `inputs` and `config` can be made fully resident on this device.
768    ///
769    /// [`VyreBackend::supports_grid_sync`] reports that native lowering is
770    /// *available*; this reports whether it *fits* for a specific dispatch. A
771    /// cooperative launch requires every block co-resident, so a grid whose
772    /// block count exceeds the device's cooperative residency cannot run
773    /// natively and must route to the resident-fixpoint or host-split path.
774    /// Orchestrators call this to choose the native route only when it fits,
775    /// avoiding a wasted allocate/upload that would otherwise end in
776    /// [`crate::ErrorCode::CooperativeResidencyExceeded`].
777    ///
778    /// Default: `Ok(false)` (no native cooperative launch). Backends that lower
779    /// grid sync override this with the real residency check. Returns `Ok(false)`
780    /// — not an error — when the program carries no grid-sync barrier, since
781    /// there is then nothing to launch cooperatively.
782    ///
783    /// # Errors
784    ///
785    /// Returns [`BackendError`] if the launch geometry cannot be computed for
786    /// the program/inputs (a structurally invalid dispatch).
787    fn cooperative_grid_sync_fits(
788        &self,
789        _program: &Program,
790        _inputs: &[&[u8]],
791        _config: &DispatchConfig,
792    ) -> Result<bool, BackendError> {
793        Ok(false)
794    }
795
796    /// Whether the shared registry wrapper may emulate whole-grid
797    /// synchronization for this backend by splitting one program into
798    /// multiple host-dispatched kernels.
799    ///
800    /// This exists separately from [`VyreBackend::supports_grid_sync`]
801    /// because a backend can intentionally reject hidden host
802    /// orchestration while native cooperative-grid lowering is absent.
803    /// CUDA uses that policy in the release path so missing native
804    /// grid-barrier lowering is surfaced as an unsupported feature
805    /// instead of silently becoming a slower multi-launch path.
806    ///
807    /// Default: `true` to preserve existing behavior for simple
808    /// backends that intentionally rely on shared split lowering.
809    #[must_use]
810    fn allows_host_grid_sync_split(&self) -> bool {
811        true
812    }
813
814    /// Whether this backend implements the resident half of the contract
815    /// (`allocate_resident` / `upload_resident` / `dispatch_resident_timed` /
816    /// `dispatch_resident_repeated_sequence_read_ranges_into` /
817    /// `download_resident_*` / `free_resident`) well enough to run a
818    /// device-resident dispatch sequence.
819    ///
820    /// Consumers use this to choose
821    /// [`crate::grid_sync::dispatch_resident_grid_sync_fixpoint_into`] (which
822    /// keeps live buffers device-resident across every grid-sync segment and
823    /// fixpoint pass) over the host-orchestrated
824    /// [`crate::grid_sync::dispatch_with_grid_sync_split_into`] (which
825    /// round-trips every live buffer host↔device between segments). Both are
826    /// correct; the choice is a performance route on a probed capability, not
827    /// a silent failure fallback.
828    ///
829    /// Default: `false`. Backends that implement resident dispatch override
830    /// this to `true`.
831    #[must_use]
832    fn supports_resident_dispatch(&self) -> bool {
833        false
834    }
835
836    /// Whether this backend partitions a program across more than one
837    /// physical device / node.
838    ///
839    /// Default: `false` (single-device execution).
840    #[must_use]
841    fn is_distributed(&self) -> bool {
842        false
843    }
844
845    /// Whether this backend lowers distributed collective communication
846    /// nodes (`AllReduce`, `AllGather`, `ReduceScatter`, `Broadcast`).
847    ///
848    /// This is intentionally separate from [`VyreBackend::is_distributed`]:
849    /// a backend may partition work across devices without yet exposing a
850    /// correct collective transport/lowering stack. Default: `false`.
851    #[must_use]
852    fn supports_distributed_collectives(&self) -> bool {
853        false
854    }
855
856    /// Maximum supported workgroup size per axis `[x, y, z]`.
857    ///
858    /// Default: `[1, 1, 1]` (scalar dispatch  -  a backend that has not
859    /// reported a real limit cannot be trusted to execute parallel
860    /// workgroups).
861    #[must_use]
862    fn max_workgroup_size(&self) -> [u32; 3] {
863        [1, 1, 1]
864    }
865
866    /// Maximum number of compute workgroups the backend can launch in one
867    /// dispatch dimension.
868    ///
869    /// Default: `1`, which is safe for scalar/reference backends but must be
870    /// overridden by real GPU backends so schedulers do not under-launch.
871    #[must_use]
872    fn max_compute_workgroups_per_dimension(&self) -> u32 {
873        1
874    }
875
876    /// Maximum total invocations allowed in a single workgroup.
877    ///
878    /// Default derives from [`max_workgroup_size`](Self::max_workgroup_size)
879    /// and fails loudly if a backend reports an unrepresentable product.
880    #[must_use]
881    fn max_compute_invocations_per_workgroup(&self) -> u32 {
882        let [x, y, z] = self.max_workgroup_size();
883        let invocations = u128::from(x) * u128::from(y) * u128::from(z);
884        u32::try_from(invocations).unwrap_or(u32::MAX)
885    }
886
887    /// Native subgroup size for the backing device when the backend
888    /// knows it. Returning
889    /// `None` tells the dispatch planner the backend can't report a
890    /// subgroup width  -  the planner falls back to `max_workgroup_size`
891    /// for its sizing heuristic.
892    ///
893    /// I.6  -  adaptive workgroup sizing reads this capability to pick
894    /// a workgroup multiple of the subgroup so threads don't straddle
895    /// subgroups. Typical devices expose 16, 32, or 64 lanes.
896    #[must_use]
897    fn subgroup_size(&self) -> Option<u32> {
898        None
899    }
900
901    /// Maximum size in bytes of a single storage buffer the backend
902    /// accepts. `0` means the backend has not reported a limit, not
903    /// "unlimited".
904    ///
905    /// Default: `0`.
906    #[must_use]
907    fn max_storage_buffer_bytes(&self) -> u64 {
908        0
909    }
910
911    /// Unified backend-neutral device profile.
912    ///
913    /// Shared planner code should prefer this single profile over reading
914    /// individual capability methods one by one. Concrete backends may
915    /// override it when they can report richer device facts such as shared
916    /// memory size or native lowering-strategy features.
917    #[must_use]
918    fn device_profile(&self) -> crate::DeviceProfile {
919        let max_workgroup_size = self.max_workgroup_size();
920        crate::DeviceProfile {
921            backend: self.id(),
922            supports_subgroup_ops: self.supports_subgroup_ops(),
923            supports_indirect_dispatch: self.supports_indirect_dispatch(),
924            supports_distributed_collectives: self.supports_distributed_collectives(),
925            supports_specialization_constants: false,
926            supports_f16: self.supports_f16(),
927            supports_bf16: self.supports_bf16(),
928            supports_trap_propagation: false,
929            supports_tensor_cores: self.supports_tensor_cores(),
930            has_mul_high: false,
931            has_dual_issue_fp32_int32: false,
932            has_subgroup_shuffle: self.supports_subgroup_ops(),
933            has_shared_memory: false,
934            max_native_int_width: 32,
935            max_workgroup_size,
936            max_invocations_per_workgroup: self.max_compute_invocations_per_workgroup(),
937            max_shared_memory_bytes: 0,
938            max_storage_buffer_binding_size: self.max_storage_buffer_bytes(),
939            subgroup_size: self.subgroup_size().unwrap_or(0),
940            compute_units: 0,
941            regs_per_thread_max: 0,
942            l1_cache_bytes: 0,
943            l2_cache_bytes: 0,
944            mem_bw_gbps: 0,
945            timing_quality: crate::DeviceTimingQuality::HostOnly,
946            supports_device_timestamps: false,
947            supports_hardware_counters: false,
948            ideal_unroll_depth: 0,
949            ideal_vector_pack_bits: 0,
950            ideal_workgroup_tile: [0, 0, 0],
951            shared_memory_bank_count: 0,
952            shared_memory_bank_width_bytes: 0,
953        }
954    }
955
956    // ---------------------------------------------------------------
957    // Lifecycle hooks (defaulted, override as needed).
958    //
959    // These let a backend warm caches, flush pending work, recover from
960    // device loss, or tear down cleanly. Every hook defaults to a
961    // no-op-or-structured-error, so existing impls do not have to add
962    // any code.
963    // ---------------------------------------------------------------
964
965    /// Pre-dispatch warmup. Called before the first dispatch on a new
966    /// program so the backend can warm caches, compile ahead-of-time, or
967    /// acquire a device handle without paying that cost on the hot path.
968    ///
969    /// Default: no-op `Ok(())`.
970    ///
971    /// # Errors
972    ///
973    /// Returns [`BackendError`] if warmup cannot complete.
974    fn prepare(&self) -> Result<(), BackendError> {
975        Ok(())
976    }
977
978    /// Flush any queued work to the device and wait for it to complete.
979    ///
980    /// Useful before tearing down a context or before reading back data
981    /// that was produced by the last asynchronous dispatch.
982    ///
983    /// Default: no-op `Ok(())`  -  backends that do not queue work
984    /// implicitly satisfy flush.
985    ///
986    /// # Errors
987    ///
988    /// Returns [`BackendError`] on device failure.
989    fn flush(&self) -> Result<(), BackendError> {
990        Ok(())
991    }
992
993    /// Release device resources held by this backend. After `shutdown`
994    /// returns the backend is in an unspecified state and may not be
995    /// used for further dispatches.
996    ///
997    /// Default: no-op `Ok(())`.
998    ///
999    /// # Errors
1000    ///
1001    /// Returns [`BackendError`] on device failure during teardown.
1002    fn shutdown(&self) -> Result<(), BackendError> {
1003        Ok(())
1004    }
1005
1006    /// Probe whether the underlying device has been lost since the last
1007    /// successful dispatch.
1008    ///
1009    /// Default: `false` (assume healthy  -  backends that have no
1010    /// device-loss story do not need to probe).
1011    #[must_use]
1012    fn device_lost(&self) -> bool {
1013        false
1014    }
1015
1016    /// Attempt to recover from device loss by reacquiring the underlying
1017    /// device and invalidating pipeline caches.
1018    ///
1019    /// Default: returns an `UnsupportedFeature` error  -  recovery must be
1020    /// opt-in, because a backend that silently re-acquires without
1021    /// notifying the caller is a correctness hazard.
1022    ///
1023    /// # Errors
1024    ///
1025    /// Returns [`BackendError::UnsupportedFeature`] by default. Backends
1026    /// that implement recovery return any error encountered during
1027    /// re-acquisition.
1028    fn try_recover(&self) -> Result<(), BackendError> {
1029        Err(BackendError::UnsupportedFeature {
1030            name: "device recovery".to_string(),
1031            backend: self.id().to_string(),
1032        })
1033    }
1034
1035    /// Allocate a backend-owned device buffer of `byte_len` bytes.
1036    ///
1037    /// The returned [`DeviceBuffer`] handle is only meaningful to the
1038    /// backend that produced it. Backends that have not opted in return
1039    /// [`BackendError::UnsupportedFeature`] with the
1040    /// `DEVICE_BUFFER_FEATURE` name; production callers that require
1041    /// resident-buffer performance must treat that as a hard capability
1042    /// failure rather than silently routing through host `Vec<u8>`
1043    /// dispatch. Real device backends (cuda/wgpu/spirv) override this to
1044    /// wrap their concrete handle (for example, a vendor device allocation,
1045    /// vulkan buffer) in a `DeviceBuffer` impl.
1046    ///
1047    /// See `crate::backend::device_buffer` for the substrate.
1048    ///
1049    /// # Errors
1050    ///
1051    /// Returns [`BackendError::UnsupportedFeature`] when the backend
1052    /// has not yet implemented persistent device-buffer allocation.
1053    fn allocate_device_buffer(
1054        &self,
1055        _byte_len: usize,
1056    ) -> Result<Box<dyn DeviceBuffer>, BackendError> {
1057        Err(unsupported_device_buffer(self.id()))
1058    }
1059
1060    /// Upload host bytes into a previously-allocated device buffer.
1061    ///
1062    /// # Errors
1063    ///
1064    /// Returns [`BackendError`] when the buffer was not allocated by this
1065    /// backend, the byte length does not match the allocation, or the
1066    /// backend has not opted in to device-buffer dispatch.
1067    fn upload_device_buffer(
1068        &self,
1069        _buffer: &mut dyn DeviceBuffer,
1070        _bytes: &[u8],
1071    ) -> Result<(), BackendError> {
1072        Err(unsupported_device_buffer(self.id()))
1073    }
1074
1075    /// Download bytes from a device buffer back to a host `Vec<u8>`.
1076    ///
1077    /// # Errors
1078    ///
1079    /// Returns [`BackendError`] when the buffer was not allocated by this
1080    /// backend or the backend has not opted in to device-buffer dispatch.
1081    fn download_device_buffer(&self, _buffer: &dyn DeviceBuffer) -> Result<Vec<u8>, BackendError> {
1082        Err(unsupported_device_buffer(self.id()))
1083    }
1084
1085    /// Free a device buffer previously returned by
1086    /// [`Self::allocate_device_buffer`]. Explicit-free is required
1087    /// because the substrate does not assume reference-counted backend
1088    /// handles; consumers are responsible for calling this when done.
1089    ///
1090    /// # Errors
1091    ///
1092    /// Returns [`BackendError`] when the buffer was not allocated by this
1093    /// backend or the underlying free fails.
1094    fn free_device_buffer(&self, _buffer: Box<dyn DeviceBuffer>) -> Result<(), BackendError> {
1095        Err(unsupported_device_buffer(self.id()))
1096    }
1097
1098    /// Dispatch a Program with backend-owned device buffers as inputs and
1099    /// outputs.
1100    ///
1101    /// Backends that have implemented [`Self::allocate_device_buffer`]
1102    /// override this method to bind their concrete buffer type without
1103    /// the host upload/download round trip the legacy `dispatch` API
1104    /// requires. Backends that have not opted in return
1105    /// [`BackendError::UnsupportedFeature`]. Callers that choose this
1106    /// API are asking for resident-buffer execution; falling back to
1107    /// host-buffer dispatch would hide the copy cost and violate the
1108    /// performance contract.
1109    ///
1110    /// # Errors
1111    ///
1112    /// Returns [`BackendError::UnsupportedFeature`] by default. Real
1113    /// implementations return any error encountered during dispatch.
1114    fn dispatch_with_device_buffers(
1115        &self,
1116        _program: &Program,
1117        _inputs: &[&dyn DeviceBuffer],
1118        _outputs: &mut [&mut dyn DeviceBuffer],
1119        _config: &DispatchConfig,
1120    ) -> Result<(), BackendError> {
1121        Err(unsupported_device_buffer(self.id()))
1122    }
1123}
1124
1125fn elapsed_resident_sequence_wall_ns(started: std::time::Instant) -> Result<u64, BackendError> {
1126    u64::try_from(started.elapsed().as_nanos()).map_err(|error| BackendError::InvalidProgram {
1127        fix: format!(
1128            "Fix: resident sequence wall timing cannot fit u64 nanoseconds: {error}. Split telemetry windows or report per-step timing."
1129        ),
1130    })
1131}
1132
1133fn sum_optional_timing(
1134    accumulator: Option<u64>,
1135    next: Option<u64>,
1136    field: &'static str,
1137) -> Result<Option<u64>, BackendError> {
1138    match (accumulator, next) {
1139        (Some(left), Some(right)) => Ok(Some(left.checked_add(right).ok_or_else(|| {
1140            BackendError::InvalidProgram {
1141                fix: format!(
1142                    "Fix: resident sequence {field} overflowed u64 nanoseconds. Split telemetry windows or report per-step timing instead of silently clamping."
1143                ),
1144            }
1145        })?)),
1146        _ => Ok(None),
1147    }
1148}
1149
1150#[cfg(test)]
1151
1152mod tests {
1153    use super::*;
1154    use std::sync::atomic::{AtomicUsize, Ordering};
1155
1156    struct TelemetryBackend;
1157
1158    impl private::Sealed for TelemetryBackend {}
1159
1160    impl VyreBackend for TelemetryBackend {
1161        fn id(&self) -> &'static str {
1162            "telemetry-test"
1163        }
1164
1165        fn dispatch(
1166            &self,
1167            _program: &Program,
1168            _inputs: &[Vec<u8>],
1169            _config: &DispatchConfig,
1170        ) -> Result<Vec<Vec<u8>>, BackendError> {
1171            Ok(vec![vec![1, 2], vec![3, 4]])
1172        }
1173    }
1174
1175    struct SequenceTimingBackend {
1176        dispatches: AtomicUsize,
1177    }
1178
1179    impl private::Sealed for SequenceTimingBackend {}
1180
1181    impl VyreBackend for SequenceTimingBackend {
1182        fn id(&self) -> &'static str {
1183            "sequence-timing-test"
1184        }
1185
1186        fn dispatch(
1187            &self,
1188            _program: &Program,
1189            _inputs: &[Vec<u8>],
1190            _config: &DispatchConfig,
1191        ) -> Result<Vec<Vec<u8>>, BackendError> {
1192            Ok(Vec::new())
1193        }
1194
1195        fn dispatch_resident_timed(
1196            &self,
1197            _program: &Program,
1198            _resources: &[Resource],
1199            config: &DispatchConfig,
1200        ) -> Result<TimedDispatchResult, BackendError> {
1201            let index = self.dispatches.fetch_add(1, Ordering::SeqCst) as u64;
1202            assert_eq!(
1203                config.grid_override,
1204                Some([index as u32 + 1, 1, 1]),
1205                "Fix: default resident sequence timing must preserve each step's grid override."
1206            );
1207            Ok(TimedDispatchResult {
1208                outputs: Vec::new(),
1209                wall_ns: 10 + index,
1210                device_ns: Some(7 + index),
1211                enqueue_ns: Some(3 + index),
1212                wait_ns: Some(4 + index),
1213            })
1214        }
1215
1216        fn download_resident_ranges_into(
1217            &self,
1218            ranges: &[(&Resource, usize, usize)],
1219            outputs: &mut [&mut Vec<u8>],
1220        ) -> Result<(), BackendError> {
1221            assert_eq!(ranges.len(), outputs.len());
1222            for ((resource, offset, len), output) in ranges.iter().zip(outputs.iter_mut()) {
1223                let Resource::Resident(id) = resource else {
1224                    panic!("Fix: default timed resident sequence test expects resident resources.");
1225                };
1226                output.clear();
1227                output.extend_from_slice(&id.to_le_bytes());
1228                output.extend_from_slice(&(*offset as u64).to_le_bytes());
1229                output.extend_from_slice(&(*len as u64).to_le_bytes());
1230            }
1231            Ok(())
1232        }
1233    }
1234
1235    #[test]
1236    fn default_borrowed_into_dispatch_records_runtime_telemetry() {
1237        let _guard = crate::observability::audit_events_test_lock();
1238        let before = crate::observability::snapshot_dispatch_telemetry();
1239        let backend = TelemetryBackend;
1240        let mut outputs = vec![Vec::with_capacity(4), Vec::with_capacity(1)];
1241
1242        backend
1243            .dispatch_borrowed_into(
1244                &Program::empty(),
1245                &[&[9, 8, 7]],
1246                &DispatchConfig::default(),
1247                &mut outputs,
1248            )
1249            .expect("Fix: default borrowed-into dispatch must succeed");
1250
1251        let telemetry = crate::observability::snapshot_dispatch_telemetry();
1252        assert!(telemetry.launches >= before.launches + 1);
1253        assert!(telemetry.input_bytes >= before.input_bytes + 3);
1254        assert!(telemetry.output_bytes >= before.output_bytes + 4);
1255        assert!(telemetry.output_slots >= before.output_slots + 2);
1256        assert!(telemetry.output_slots_reused >= before.output_slots_reused + 1);
1257        assert!(telemetry.output_slots_moved >= before.output_slots_moved + 1);
1258        assert!(telemetry.output_slots_appended >= before.output_slots_appended);
1259    }
1260
1261    #[test]
1262    fn default_resident_sequence_timing_sums_step_device_times_and_reads_ranges() {
1263        let backend = SequenceTimingBackend {
1264            dispatches: AtomicUsize::new(0),
1265        };
1266        let program = Program::empty();
1267        let first_resources = [Resource::Resident(11)];
1268        let second_resources = [Resource::Resident(22)];
1269        let steps = [
1270            ResidentDispatchStep {
1271                program: &program,
1272                resources: &first_resources,
1273                grid_override: Some([1, 1, 1]),
1274            },
1275            ResidentDispatchStep {
1276                program: &program,
1277                resources: &second_resources,
1278                grid_override: Some([2, 1, 1]),
1279            },
1280        ];
1281        let read_resource = Resource::Resident(33);
1282        let reads = [ResidentReadRange {
1283            resource: &read_resource,
1284            byte_offset: 4,
1285            byte_len: 8,
1286        }];
1287        let mut output = Vec::new();
1288
1289        let timing = backend
1290            .dispatch_resident_sequence_read_ranges_timed_into(&steps, &reads, &mut [&mut output])
1291            .expect("Fix: default timed resident sequence must execute and read ranges.");
1292
1293        assert_eq!(backend.dispatches.load(Ordering::SeqCst), 2);
1294        assert_eq!(timing.device_ns, Some(15));
1295        assert_eq!(timing.enqueue_ns, Some(7));
1296        assert_eq!(timing.wait_ns, Some(9));
1297        assert!(timing.wall_ns > 0);
1298        assert_eq!(output.len(), 24);
1299        assert_eq!(u64::from_le_bytes(output[0..8].try_into().unwrap()), 33);
1300        assert_eq!(u64::from_le_bytes(output[8..16].try_into().unwrap()), 4);
1301        assert_eq!(u64::from_le_bytes(output[16..24].try_into().unwrap()), 8);
1302    }
1303
1304    #[test]
1305    fn default_dispatch_paths_use_shared_fallible_staging_and_checked_timing() {
1306        let backend_source = include_str!("vyre_backend.rs");
1307        let compiled_source = include_str!("compiled_pipeline.rs");
1308        let module_source = include_str!("../backend.rs");
1309
1310        assert!(
1311            module_source.contains("fn clone_borrowed_inputs_for_dispatch")
1312                && module_source.contains("fn reserve_batch_output_slots")
1313                && module_source.contains("fn checked_elapsed_wall_ns"),
1314            "Fix: backend defaults must share one fallible staging and checked timing contract."
1315        );
1316        for source in [backend_source, compiled_source] {
1317            let production = source
1318                .split("#[cfg(test)]")
1319                .next()
1320                .expect("Fix: backend source must contain production section before tests");
1321            assert!(
1322                production.contains("clone_borrowed_inputs_for_dispatch")
1323                    && production.contains("checked_elapsed_wall_ns")
1324                    && !production.contains(".as_nanos() as u64")
1325                    && !production.contains("inputs.iter().map(|input| (*input).to_vec()).collect()"),
1326                "Fix: inherited backend dispatch defaults must avoid infallible borrowed-input collection and lossy wall timing."
1327            );
1328        }
1329        let compiled_production = compiled_source
1330            .split("#[cfg(test)]")
1331            .next()
1332            .expect("Fix: compiled pipeline source must contain production section before tests");
1333        assert!(
1334            compiled_production.contains("reserved_batch_output_slots")
1335                && !compiled_production.contains("Vec::with_capacity(batches.len())"),
1336            "Fix: compiled-pipeline batch defaults must construct output slots through shared fallible staging."
1337        );
1338    }
1339}