Skip to main content

vyre_driver/backend/
vyre_backend.rs

1//! The frozen `VyreBackend` contract.
2
3use std::sync::Arc;
4
5use smallvec::SmallVec;
6use vyre_foundation::ir::Program;
7
8use crate::backend::{
9    device_buffer::unsupported_device_buffer, private, BackendError, CompiledPipeline,
10    DeviceBuffer, DispatchConfig, OutputBuffers, PendingDispatch, Resource, TimedDispatchResult,
11};
12
13/// One backend-resident program dispatch in an ordered sequence.
14pub struct ResidentDispatchStep<'a> {
15    /// Program to dispatch.
16    pub program: &'a Program,
17    /// Resident resources in binding order.
18    pub resources: &'a [Resource],
19    /// Optional CUDA/grid-style launch override.
20    pub grid_override: Option<[u32; 3]>,
21    /// Optional workgroup override. MUST be carried alongside `grid_override`:
22    /// a caller that sizes its grid for a specific workgroup (grid =
23    /// ceil(work / workgroup)) will under-cover the work if the step falls back
24    /// to a different default workgroup. `None` keeps the backend's resolved
25    /// default (correct only when `grid_override` is also `None`).
26    pub workgroup_override: Option<[u32; 3]>,
27}
28
29/// One compact byte range to read from a backend-resident resource.
30pub struct ResidentReadRange<'a> {
31    /// Resident resource to read.
32    pub resource: &'a Resource,
33    /// Byte offset inside the resident resource.
34    pub byte_offset: usize,
35    /// Number of bytes to read.
36    pub byte_len: usize,
37}
38
39/// Timing captured for an ordered resident dispatch sequence.
40#[derive(Clone, Debug, Default, Eq, PartialEq)]
41pub struct ResidentSequenceTiming {
42    /// Host-observed sequence duration including requested readbacks.
43    pub wall_ns: u64,
44    /// Device-observed elapsed dispatch time when the backend exposes timers.
45    pub device_ns: Option<u64>,
46    /// Host time spent enqueueing backend work before waiting.
47    pub enqueue_ns: Option<u64>,
48    /// Host time spent waiting for completion and collecting outputs.
49    pub wait_ns: Option<u64>,
50}
51
52/// The frozen contract between vyre and every execution backend.
53///
54/// A backend is a pure function from a validated `Program` and input buffers
55/// to output buffers. Implementations must be `Send + Sync`, deterministic
56/// for identical inputs, and byte-identical to the CPU reference on success.
57/// This trait is the keystone of the vyre abstraction thesis: frontends do
58/// not know which backend runs their IR, and backends do not know which
59/// frontend produced it.
60///
61/// # Examples
62///
63pub trait VyreBackend: private::Sealed + Send + Sync {
64    /// Stable backend identifier used for logging, certificates, and adapter selection.
65    ///
66    /// The identifier must be unique among all backends linked into the
67    /// current process. Conformance reports include this string so that
68    /// consumers know exactly which implementation was certified.
69    fn id(&self) -> &'static str;
70
71    /// Backend implementation version string used for certificates and
72    /// regression tracking.
73    ///
74    /// The default returns `"unspecified"`. Concrete backends should
75    /// override this with their crate version (e.g. `"0.4.0"`) so that
76    /// certificates can detect backend upgrades that may require re-cert.
77    fn version(&self) -> &'static str {
78        "unspecified"
79    }
80
81    /// Operation ids this backend can execute without further lowering.
82    fn supported_ops(&self) -> &std::collections::HashSet<vyre_foundation::ir::OpId> {
83        use crate::backend::validation::default_supported_ops;
84        default_supported_ops()
85    }
86
87    // Raw backend shader text is a concrete-driver implementation
88    // detail, not part of the substrate-neutral `VyreBackend`
89    // contract.
90
91    /// Executes the program with the given input buffers and returns the output buffers.
92    ///
93    /// On success the returned bytes must match the pure-Rust reference
94    /// implementation bit-for-bit. On failure the backend must return a
95    /// [`BackendError`] whose message contains an actionable `Fix: ` hint.
96    ///
97    /// # Examples
98    ///
99    /// ```no_run
100    /// use vyre::{Program, VyreBackend, DispatchConfig};
101    ///
102    /// # fn example(backend: &dyn VyreBackend, program: &Program) -> Result<Vec<Vec<u8>>, vyre::BackendError> {
103    /// let inputs = vec![vec![1u8, 2, 3]];
104    /// let config = DispatchConfig::default();
105    /// backend.dispatch(program, &inputs, &config)
106    /// # }
107    /// ```
108    ///
109    /// # Errors
110    ///
111    /// Returns [`BackendError`] when the backend cannot complete dispatch.
112    /// The error message always includes a `Fix: ` remediation section.
113    fn dispatch(
114        &self,
115        program: &Program,
116        inputs: &[Vec<u8>],
117        config: &DispatchConfig,
118    ) -> Result<Vec<Vec<u8>>, BackendError>;
119
120    /// Executes the program with borrowed input buffers.
121    ///
122    /// Backends may override this method to avoid staging borrowed bytes into
123    /// owned `Vec<u8>` buffers. The default is non-breaking: it performs one
124    /// owned vector allocation for the call and delegates to
125    /// [`VyreBackend::dispatch`].
126    ///
127    /// # Errors
128    ///
129    /// Returns [`BackendError`] when the backend cannot complete dispatch.
130    fn dispatch_borrowed(
131        &self,
132        program: &Program,
133        inputs: &[&[u8]],
134        config: &DispatchConfig,
135    ) -> Result<Vec<Vec<u8>>, BackendError> {
136        let owned =
137            crate::backend::clone_borrowed_inputs_for_dispatch(inputs, "backend input staging")?;
138        let outputs = self.dispatch(program, &owned, config)?;
139        crate::observability::record_dispatch_io(inputs, &outputs);
140        Ok(outputs)
141    }
142
143    /// Executes a borrowed-input dispatch and returns backend-owned timing.
144    ///
145    /// The default records host wall time and delegates to
146    /// [`VyreBackend::dispatch_borrowed`]. Device-specific backends override
147    /// this only inside their driver crates so benchmark crates never import
148    /// vendor APIs directly.
149    ///
150    /// # Errors
151    ///
152    /// Returns [`BackendError`] when the backend cannot complete dispatch.
153    fn dispatch_borrowed_timed(
154        &self,
155        program: &Program,
156        inputs: &[&[u8]],
157        config: &DispatchConfig,
158    ) -> Result<TimedDispatchResult, BackendError> {
159        let started = std::time::Instant::now();
160        let outputs = self.dispatch_borrowed(program, inputs, config)?;
161        Ok(TimedDispatchResult {
162            outputs,
163            wall_ns: crate::backend::checked_elapsed_wall_ns(started, "backend borrowed dispatch")?,
164            device_ns: None,
165            enqueue_ns: None,
166            wait_ns: None,
167        })
168    }
169
170    /// Executes the program with borrowed input buffers and writes outputs into
171    /// caller-owned storage.
172    ///
173    /// Backends may override this method to reuse output buffers across
174    /// dispatches. The default preserves the existing dispatch contract and
175    /// copies returned bytes into existing output slots where possible.
176    ///
177    /// # Errors
178    ///
179    /// Returns [`BackendError`] when the backend cannot complete dispatch.
180    fn dispatch_borrowed_into(
181        &self,
182        program: &Program,
183        inputs: &[&[u8]],
184        config: &DispatchConfig,
185        outputs: &mut OutputBuffers,
186    ) -> Result<(), BackendError> {
187        let result = self.dispatch_borrowed(program, inputs, config)?;
188        let stats = crate::backend::dispatch_result::replace_output_buffers_preserving_slots_with_memory_stats(
189            result,
190            outputs,
191        );
192        crate::observability::record_output_replacement_stats(stats);
193        Ok(())
194    }
195
196    /// Allocate a backend-resident buffer and return a stable resource handle.
197    ///
198    /// Backends that support resident resources override this method so callers
199    /// can keep hot inputs on the device without importing a concrete driver
200    /// crate. The returned [`Resource`] is only meaningful to the backend that
201    /// produced it.
202    ///
203    /// # Errors
204    ///
205    /// Returns [`BackendError`] when the backend cannot allocate a resident
206    /// resource of the requested size.
207    fn allocate_resident(&self, _byte_len: usize) -> Result<Resource, BackendError> {
208        Err(BackendError::UnsupportedFeature {
209            name: "resident buffer allocation".to_string(),
210            backend: self.id().to_string(),
211        })
212    }
213
214    /// Upload bytes into a backend-resident resource.
215    ///
216    /// # Errors
217    ///
218    /// Returns [`BackendError`] when the resource is not owned by this backend
219    /// or the byte length does not match the resident allocation.
220    fn upload_resident(&self, _resource: &Resource, _bytes: &[u8]) -> Result<(), BackendError> {
221        Err(BackendError::UnsupportedFeature {
222            name: "resident buffer upload".to_string(),
223            backend: self.id().to_string(),
224        })
225    }
226
227    /// Upload several backend-resident resources as one logical staging
228    /// operation.
229    ///
230    /// Backends that support resident graph/dataflow hot paths should override
231    /// this with a native batched transfer. The default fails loudly instead
232    /// of looping over [`VyreBackend::upload_resident`], because a hidden
233    /// per-buffer synchronization loop destroys the performance contract.
234    ///
235    /// # Errors
236    ///
237    /// Returns [`BackendError`] when the backend cannot batch resident uploads
238    /// or when any resource/byte length is invalid.
239    fn upload_resident_many(&self, _uploads: &[(&Resource, &[u8])]) -> Result<(), BackendError> {
240        Err(BackendError::UnsupportedFeature {
241            name: "resident buffer batch upload".to_string(),
242            backend: self.id().to_string(),
243        })
244    }
245
246    /// Upload bytes into a subrange of a backend-resident resource.
247    ///
248    /// This is the hot-loop path for reusable resident slots whose capacity is
249    /// larger than the current logical payload. Backends must not require the
250    /// upload length to equal the allocation length.
251    ///
252    /// # Errors
253    ///
254    /// Returns [`BackendError`] when ranged upload is unsupported, the resource
255    /// is not owned by this backend, or the destination range is out of bounds.
256    fn upload_resident_at(
257        &self,
258        _resource: &Resource,
259        _dst_offset_bytes: usize,
260        _bytes: &[u8],
261    ) -> Result<(), BackendError> {
262        Err(BackendError::UnsupportedFeature {
263            name: "resident buffer ranged upload".to_string(),
264            backend: self.id().to_string(),
265        })
266    }
267
268    /// Upload several resident subranges as one logical staging operation.
269    ///
270    /// The default fails loudly instead of looping over
271    /// [`VyreBackend::upload_resident_at`], because hidden per-range
272    /// synchronization breaks resident hot-loop performance.
273    ///
274    /// # Errors
275    ///
276    /// Returns [`BackendError`] when ranged batch upload is unsupported or when
277    /// any resource/range is invalid.
278    fn upload_resident_at_many(
279        &self,
280        _uploads: &[(&Resource, usize, &[u8])],
281    ) -> Result<(), BackendError> {
282        Err(BackendError::UnsupportedFeature {
283            name: "resident buffer ranged batch upload".to_string(),
284            backend: self.id().to_string(),
285        })
286    }
287
288    /// Download a backend-resident resource into a new host buffer.
289    ///
290    /// Prefer [`VyreBackend::download_resident_into`] in hot loops so repeated
291    /// validation does not allocate a fresh `Vec` for every readback.
292    ///
293    /// # Errors
294    ///
295    /// Returns [`BackendError`] when the backend cannot download resident
296    /// resources or when `resource` is not owned by this backend.
297    fn download_resident(&self, resource: &Resource) -> Result<Vec<u8>, BackendError> {
298        let mut bytes = Vec::new();
299        self.download_resident_into(resource, &mut bytes)?;
300        Ok(bytes)
301    }
302
303    /// Download a backend-resident resource into caller-owned storage.
304    ///
305    /// Implementations must clear and reuse `out`; hidden compatibility
306    /// allocation defeats resident hot-loop validation.
307    ///
308    /// # Errors
309    ///
310    /// Returns [`BackendError`] when resident download is unsupported or when
311    /// `resource` is not owned by this backend.
312    fn download_resident_into(
313        &self,
314        _resource: &Resource,
315        _out: &mut Vec<u8>,
316    ) -> Result<(), BackendError> {
317        Err(BackendError::UnsupportedFeature {
318            name: "resident buffer download".to_string(),
319            backend: self.id().to_string(),
320        })
321    }
322
323    /// Download a byte range from a backend-resident resource into a new host
324    /// buffer.
325    ///
326    /// Prefer [`VyreBackend::download_resident_range_into`] in hot loops.
327    ///
328    /// # Errors
329    ///
330    /// Returns [`BackendError`] when resident ranged download is unsupported,
331    /// the range is invalid, or `resource` is not owned by this backend.
332    fn download_resident_range(
333        &self,
334        resource: &Resource,
335        byte_offset: usize,
336        byte_len: usize,
337    ) -> Result<Vec<u8>, BackendError> {
338        let mut bytes = Vec::new();
339        bytes.try_reserve_exact(byte_len).map_err(|error| {
340            BackendError::InvalidProgram {
341                fix: format!(
342                    "Fix: resident ranged download could not reserve {byte_len} output byte(s): {error}. Split the readback range before dispatch."
343                ),
344            }
345        })?;
346        self.download_resident_range_into(resource, byte_offset, byte_len, &mut bytes)?;
347        Ok(bytes)
348    }
349
350    /// Download a byte range from a backend-resident resource into
351    /// caller-owned storage.
352    ///
353    /// # Errors
354    ///
355    /// Returns [`BackendError`] when resident ranged download is unsupported,
356    /// the range is invalid, or `resource` is not owned by this backend.
357    fn download_resident_range_into(
358        &self,
359        _resource: &Resource,
360        _byte_offset: usize,
361        _byte_len: usize,
362        _out: &mut Vec<u8>,
363    ) -> Result<(), BackendError> {
364        Err(BackendError::UnsupportedFeature {
365            name: "resident buffer ranged download".to_string(),
366            backend: self.id().to_string(),
367        })
368    }
369
370    /// Download several byte ranges from backend-resident resources into
371    /// caller-owned storage as one logical readback operation.
372    ///
373    /// Backends with a real multi-readback path should override this to issue
374    /// all copies behind one backend synchronization boundary.
375    ///
376    /// # Errors
377    ///
378    /// Returns [`BackendError`] when counts do not match, any range is invalid,
379    /// or resident ranged download is unsupported.
380    fn download_resident_ranges_into(
381        &self,
382        ranges: &[(&Resource, usize, usize)],
383        outputs: &mut [&mut Vec<u8>],
384    ) -> Result<(), BackendError> {
385        if ranges.len() != outputs.len() {
386            return Err(BackendError::InvalidProgram {
387                fix: format!(
388                    "Fix: resident ranged batch download expected matching range/output counts but got {} range(s) and {} output(s).",
389                    ranges.len(),
390                    outputs.len()
391                ),
392            });
393        }
394        for ((resource, byte_offset, byte_len), output) in ranges.iter().zip(outputs.iter_mut()) {
395            self.download_resident_range_into(resource, *byte_offset, *byte_len, output)?;
396        }
397        Ok(())
398    }
399
400    /// Free a backend-resident resource previously returned by
401    /// [`VyreBackend::allocate_resident`].
402    ///
403    /// # Errors
404    ///
405    /// Returns [`BackendError`] when the resource is unknown or still in use.
406    fn free_resident(&self, _resource: Resource) -> Result<(), BackendError> {
407        Err(BackendError::UnsupportedFeature {
408            name: "resident buffer free".to_string(),
409            backend: self.id().to_string(),
410        })
411    }
412
413    /// Dispatch using backend-resident resources and return backend-owned
414    /// timing.
415    ///
416    /// # Errors
417    ///
418    /// Returns [`BackendError`] when the backend does not support resident
419    /// dispatch or any resource is invalid for the program.
420    fn dispatch_resident_timed(
421        &self,
422        _program: &Program,
423        _resources: &[Resource],
424        _config: &DispatchConfig,
425    ) -> Result<TimedDispatchResult, BackendError> {
426        Err(BackendError::UnsupportedFeature {
427            name: "resident timed dispatch".to_string(),
428            backend: self.id().to_string(),
429        })
430    }
431
432    /// Dispatch an ordered sequence of resident-buffer programs and read
433    /// selected resident byte ranges into caller-owned storage.
434    ///
435    /// The default preserves correctness by dispatching each step through
436    /// [`VyreBackend::dispatch_resident_timed`] and then calling
437    /// [`VyreBackend::download_resident_ranges_into`]. CUDA overrides this to
438    /// enqueue the whole dependent chain plus D2H readbacks on one stream and
439    /// pay one host synchronization point.
440    ///
441    /// # Errors
442    ///
443    /// Returns [`BackendError`] when any step fails, when readback ranges are
444    /// invalid, or when the backend cannot perform resident dispatch/readback.
445    fn dispatch_resident_sequence_read_ranges_into(
446        &self,
447        steps: &[ResidentDispatchStep<'_>],
448        read_ranges: &[ResidentReadRange<'_>],
449        outputs: &mut [&mut Vec<u8>],
450    ) -> Result<(), BackendError> {
451        for step in steps {
452            let mut config = DispatchConfig::default();
453            config.grid_override = step.grid_override;
454            self.dispatch_resident_timed(step.program, step.resources, &config)?;
455        }
456        let ranges = read_ranges
457            .iter()
458            .map(|range| (range.resource, range.byte_offset, range.byte_len))
459            .collect::<SmallVec<[_; 8]>>();
460        self.download_resident_ranges_into(&ranges, outputs)
461    }
462
463    /// Timed variant of
464    /// [`VyreBackend::dispatch_resident_sequence_read_ranges_into`].
465    ///
466    /// The default preserves correctness by summing each step's resident
467    /// dispatch timing and then downloading the requested ranges. Backends with
468    /// a fused resident sequence path should override this to keep their
469    /// optimized stream/queue behavior while exposing device timing.
470    ///
471    /// # Errors
472    ///
473    /// Returns [`BackendError`] when any step fails, when readback ranges are
474    /// invalid, or when resident sequence timing overflows.
475    fn dispatch_resident_sequence_read_ranges_timed_into(
476        &self,
477        steps: &[ResidentDispatchStep<'_>],
478        read_ranges: &[ResidentReadRange<'_>],
479        outputs: &mut [&mut Vec<u8>],
480    ) -> Result<ResidentSequenceTiming, BackendError> {
481        let started = std::time::Instant::now();
482        let mut device_ns = Some(0_u64);
483        let mut enqueue_ns = Some(0_u64);
484        let mut wait_ns = Some(0_u64);
485        for step in steps {
486            let mut config = DispatchConfig::default();
487            config.grid_override = step.grid_override;
488            let timed = self.dispatch_resident_timed(step.program, step.resources, &config)?;
489            device_ns = sum_optional_timing(device_ns, timed.device_ns, "device timing")?;
490            enqueue_ns = sum_optional_timing(enqueue_ns, timed.enqueue_ns, "enqueue timing")?;
491            wait_ns = sum_optional_timing(wait_ns, timed.wait_ns, "wait timing")?;
492        }
493        let ranges = read_ranges
494            .iter()
495            .map(|range| (range.resource, range.byte_offset, range.byte_len))
496            .collect::<SmallVec<[_; 8]>>();
497        self.download_resident_ranges_into(&ranges, outputs)?;
498        Ok(ResidentSequenceTiming {
499            wall_ns: elapsed_resident_sequence_wall_ns(started)?,
500            device_ns,
501            enqueue_ns,
502            wait_ns,
503        })
504    }
505
506    /// Dispatch a resident prefix, repeat a resident sub-sequence, and read
507    /// selected resident byte ranges into caller-owned storage.
508    ///
509    /// This is the fixed-point hot-path contract: dataflow clients can express
510    /// a repeated kernel group without allocating one host sequence entry per
511    /// iteration. Backends that understand repetition should override this to
512    /// keep launch preparation and parameter upload sublinear in
513    /// `repeat_count`.
514    ///
515    /// # Errors
516    ///
517    /// Returns [`BackendError`] when any step fails, when readback ranges are
518    /// invalid, or when the backend cannot perform resident dispatch/readback.
519    fn dispatch_resident_repeated_sequence_read_ranges_into(
520        &self,
521        prefix_steps: &[ResidentDispatchStep<'_>],
522        repeated_steps: &[ResidentDispatchStep<'_>],
523        repeat_count: u32,
524        read_ranges: &[ResidentReadRange<'_>],
525        outputs: &mut [&mut Vec<u8>],
526    ) -> Result<(), BackendError> {
527        for step in prefix_steps {
528            let mut config = DispatchConfig::default();
529            config.grid_override = step.grid_override;
530            self.dispatch_resident_timed(step.program, step.resources, &config)?;
531        }
532        for _ in 0..repeat_count {
533            for step in repeated_steps {
534                let mut config = DispatchConfig::default();
535                config.grid_override = step.grid_override;
536                self.dispatch_resident_timed(step.program, step.resources, &config)?;
537            }
538        }
539        let ranges = read_ranges
540            .iter()
541            .map(|range| (range.resource, range.byte_offset, range.byte_len))
542            .collect::<SmallVec<[_; 8]>>();
543        self.download_resident_ranges_into(&ranges, outputs)
544    }
545
546    /// Optional pre-compilation hook for the pipeline-mode API.
547    ///
548    /// Default returns `Ok(None)`  -  the framework wraps in a passthrough
549    /// pipeline whose `dispatch` calls back into [`VyreBackend::dispatch`]
550    /// every time. Backends that genuinely cache compiled state (compute
551    /// pipeline, bind-group layout, lowered shader text) override this and
552    /// return `Ok(Some(...))` so repeated dispatches skip the compilation
553    /// overhead.
554    ///
555    /// The returned pipeline MUST be bit-identical to repeated
556    /// `dispatch(program, inputs, config)` for the program it was compiled
557    /// from. The cache key is the backend's responsibility  -  the framework
558    /// does not deduplicate compile calls.
559    ///
560    /// Implementing this method is the P-6 contract from
561    /// `docs/audits/ROADMAP_PERFORMANCE.md`: "compile target-text + pipeline +
562    /// bind-group-layout once; dispatch repeatedly with different inputs."
563    ///
564    /// # Errors
565    ///
566    /// Returns [`BackendError`] when the backend cannot complete the
567    /// pre-compilation. Callers should treat this as fatal for the program
568    /// (the program will not dispatch successfully via any path).
569    fn compile_native(
570        &self,
571        _program: &Program,
572        _config: &DispatchConfig,
573    ) -> Result<Option<Arc<dyn CompiledPipeline>>, BackendError> {
574        Ok(None)
575    }
576
577    /// Optional pre-compilation hook for callers that already own a shared
578    /// program allocation.
579    ///
580    /// Backends that store the program inside the compiled pipeline should
581    /// override this method and keep the supplied [`Arc<Program>`] instead of
582    /// cloning the IR. The default preserves the older borrowed-program hook
583    /// for backends that only inspect the program while compiling.
584    ///
585    /// # Errors
586    ///
587    /// Returns [`BackendError`] when backend-native compilation fails.
588    fn compile_native_shared(
589        &self,
590        program: Arc<Program>,
591        config: &DispatchConfig,
592    ) -> Result<Option<Arc<dyn CompiledPipeline>>, BackendError> {
593        self.compile_native(&program, config)
594    }
595
596    /// Optional compiled-pipeline cache counters for compile telemetry.
597    ///
598    /// Return `None` unless the backend can report real cache hits and misses.
599    fn pipeline_cache_snapshot(&self) -> Option<crate::pipeline::PipelineCacheSnapshot> {
600        None
601    }
602
603    /// Optional backend-specific numeric telemetry for release evidence.
604    ///
605    /// Return an empty vector unless the backend can report real counters.
606    /// Metric names must be stable ASCII identifiers suitable for JSON and
607    /// Prometheus-style export.
608    fn backend_metric_snapshot(&self) -> Vec<(&'static str, u64)> {
609        Vec::new()
610    }
611
612    /// Non-blocking dispatch primitive.
613    ///
614    /// Returns a [`PendingDispatch`] handle immediately; the caller
615    /// polls via [`PendingDispatch::is_ready`] and consumes the result
616    /// via [`PendingDispatch::await_result`]. Backends that genuinely
617    /// pipeline dispatches override this so N concurrent dispatches
618    /// do not serialize on the host.
619    ///
620    /// Default: run the synchronous [`VyreBackend::dispatch`] path and
621    /// wrap the result in a trivially-ready handle. This keeps every
622    /// backend useful from the async API without forcing an async
623    /// rewrite.
624    ///
625    /// # Errors
626    ///
627    /// Returns [`BackendError`] if the dispatch cannot start. Errors
628    /// that surface only during GPU execution come back through
629    /// [`PendingDispatch::await_result`], not from this call.
630    fn dispatch_async(
631        &self,
632        program: &Program,
633        inputs: &[Vec<u8>],
634        config: &DispatchConfig,
635    ) -> Result<Box<dyn PendingDispatch>, BackendError> {
636        let outputs = self.dispatch(program, inputs, config)?;
637        Ok(Box::new(crate::backend::pending_dispatch::ReadyPending {
638            outputs,
639        }))
640    }
641
642    /// Non-blocking dispatch with borrowed input buffers.
643    ///
644    /// Backends that record GPU commands synchronously before returning can
645    /// override this to avoid cloning input buffers just to create a pending
646    /// handle. The returned [`PendingDispatch`] must not borrow from `inputs`.
647    ///
648    /// # Errors
649    ///
650    /// Returns [`BackendError`] if the dispatch cannot start.
651    fn dispatch_borrowed_async(
652        &self,
653        program: &Program,
654        inputs: &[&[u8]],
655        config: &DispatchConfig,
656    ) -> Result<Box<dyn PendingDispatch>, BackendError> {
657        let outputs = self.dispatch_borrowed(program, inputs, config)?;
658        Ok(Box::new(crate::backend::pending_dispatch::ReadyPending {
659            outputs,
660        }))
661    }
662
663    // ---------------------------------------------------------------
664    // Capability queries (all default to conservative "no" / minimal).
665    //
666    // These are the stable capability surface. Additional backends implement
667    // this trait by default-inheriting every capability below and OVERRIDING
668    // only the ones where they are more capable than the conservative floor.
669    // This means adding a backend is strictly additive  -  no existing
670    // backend impl has to change when a new capability query is added.
671    //
672    // Backends MUST report HONESTLY. Returning `true` from a capability
673    // query is a promise the lowering path emits the corresponding
674    // intrinsic and the adapter supports it. "Supported but broken" is a
675    // LAW 9 evasion (see AGENTS.md). If the feature bit is set on the
676    // device but the lowering emits a slower emulation sequence, the
677    // answer is `false` until the native lowering catches up.
678    // ---------------------------------------------------------------
679
680    /// Whether this backend's lowering path emits subgroup / wave
681    /// intrinsics AND the current adapter exposes them.
682    ///
683    /// Default: `false` (conservative  -  assumes no native subgroup lowering).
684    #[must_use]
685    fn supports_subgroup_ops(&self) -> bool {
686        false
687    }
688
689    /// Whether this backend lowers IEEE 754 binary16 (`DataType::F16`)
690    /// natively rather than emulating through `f32`.
691    ///
692    /// Default: `false`.
693    #[must_use]
694    fn supports_f16(&self) -> bool {
695        false
696    }
697
698    /// Whether this backend lowers bfloat16 (`DataType::BF16`) natively.
699    ///
700    /// Default: `false`.
701    #[must_use]
702    fn supports_bf16(&self) -> bool {
703        false
704    }
705
706    /// Whether this backend emits tensor-core / matrix-engine intrinsics
707    /// for supported tensor shapes.
708    ///
709    /// Default: `false`.
710    #[must_use]
711    fn supports_tensor_cores(&self) -> bool {
712        false
713    }
714
715    /// Whether this backend overlaps copies and compute via independent
716    /// queues or async engines.
717    ///
718    /// Default: `false` (host serializes copy ↔ compute).
719    #[must_use]
720    fn supports_async_compute(&self) -> bool {
721        false
722    }
723
724    /// Whether this backend supports indirect dispatch
725    /// (`Node::IndirectDispatch`).
726    ///
727    /// Default: `false`.
728    #[must_use]
729    fn supports_indirect_dispatch(&self) -> bool {
730        false
731    }
732
733    /// Whether this backend supports speculative dispatch  -  a fused
734    /// prefilter + confirmer kernel with commit-gated output and a
735    /// counter tail read back by the host.
736    ///
737    /// Default: `false`.
738    #[must_use]
739    fn supports_speculation(&self) -> bool {
740        false
741    }
742
743    /// Whether this backend supports device-side persistent-thread
744    /// dispatch (a long-running kernel that polls a work queue).
745    ///
746    /// Default: `false`.
747    #[must_use]
748    fn supports_persistent_thread_dispatch(&self) -> bool {
749        false
750    }
751
752    /// Whether this backend can satisfy `Node::Barrier { ordering:
753    /// MemoryOrdering::GridSync }` inside a single dispatch  -  i.e.
754    /// every thread in the entire grid waits at the barrier and
755    /// every prior write is globally visible afterwards. Backends
756    /// that lack a native grid barrier (workgroup-only fences) must
757    /// return `false`; registration-based dispatch may lower a
758    /// `GridSync` barrier to a host-orchestrated kernel split only
759    /// when [`VyreBackend::allows_host_grid_sync_split`] also returns
760    /// `true`.
761    ///
762    /// Backends with cooperative whole-grid launch support can return
763    /// `true`; backends limited to workgroup-local synchronization return
764    /// `false` until the target exposes a compatible grid-barrier primitive.
765    ///
766    /// Default: `false`.
767    #[must_use]
768    fn supports_grid_sync(&self) -> bool {
769        false
770    }
771
772    /// Whether a native cooperative grid-sync launch of `program` with these
773    /// `inputs` and `config` can be made fully resident on this device.
774    ///
775    /// [`VyreBackend::supports_grid_sync`] reports that native lowering is
776    /// *available*; this reports whether it *fits* for a specific dispatch. A
777    /// cooperative launch requires every block co-resident, so a grid whose
778    /// block count exceeds the device's cooperative residency cannot run
779    /// natively and must route to the resident-fixpoint or host-split path.
780    /// Orchestrators call this to choose the native route only when it fits,
781    /// avoiding a wasted allocate/upload that would otherwise end in
782    /// [`crate::ErrorCode::CooperativeResidencyExceeded`].
783    ///
784    /// Default: `Ok(false)` (no native cooperative launch). Backends that lower
785    /// grid sync override this with the real residency check. Returns `Ok(false)`
786    /// — not an error — when the program carries no grid-sync barrier, since
787    /// there is then nothing to launch cooperatively.
788    ///
789    /// # Errors
790    ///
791    /// Returns [`BackendError`] if the launch geometry cannot be computed for
792    /// the program/inputs (a structurally invalid dispatch).
793    fn cooperative_grid_sync_fits(
794        &self,
795        _program: &Program,
796        _inputs: &[&[u8]],
797        _config: &DispatchConfig,
798    ) -> Result<bool, BackendError> {
799        Ok(false)
800    }
801
802    /// Whether the shared registry wrapper may emulate whole-grid
803    /// synchronization for this backend by splitting one program into
804    /// multiple host-dispatched kernels.
805    ///
806    /// This exists separately from [`VyreBackend::supports_grid_sync`]
807    /// because a backend can intentionally reject hidden host
808    /// orchestration while native cooperative-grid lowering is absent.
809    /// CUDA uses that policy in the release path so missing native
810    /// grid-barrier lowering is surfaced as an unsupported feature
811    /// instead of silently becoming a slower multi-launch path.
812    ///
813    /// Default: `true` to preserve existing behavior for simple
814    /// backends that intentionally rely on shared split lowering.
815    #[must_use]
816    fn allows_host_grid_sync_split(&self) -> bool {
817        true
818    }
819
820    /// Whether this backend implements the resident half of the contract
821    /// (`allocate_resident` / `upload_resident` / `dispatch_resident_timed` /
822    /// `dispatch_resident_repeated_sequence_read_ranges_into` /
823    /// `download_resident_*` / `free_resident`) well enough to run a
824    /// device-resident dispatch sequence.
825    ///
826    /// Consumers use this to choose
827    /// [`crate::grid_sync::dispatch_resident_grid_sync_fixpoint_into`] (which
828    /// keeps live buffers device-resident across every grid-sync segment and
829    /// fixpoint pass) over the host-orchestrated
830    /// [`crate::grid_sync::dispatch_with_grid_sync_split_into`] (which
831    /// round-trips every live buffer host↔device between segments). Both are
832    /// correct; the choice is a performance route on a probed capability, not
833    /// a silent failure fallback.
834    ///
835    /// Default: `false`. Backends that implement resident dispatch override
836    /// this to `true`.
837    #[must_use]
838    fn supports_resident_dispatch(&self) -> bool {
839        false
840    }
841
842    /// Whether this backend partitions a program across more than one
843    /// physical device / node.
844    ///
845    /// Default: `false` (single-device execution).
846    #[must_use]
847    fn is_distributed(&self) -> bool {
848        false
849    }
850
851    /// Whether this backend lowers distributed collective communication
852    /// nodes (`AllReduce`, `AllGather`, `ReduceScatter`, `Broadcast`).
853    ///
854    /// This is intentionally separate from [`VyreBackend::is_distributed`]:
855    /// a backend may partition work across devices without yet exposing a
856    /// correct collective transport/lowering stack. Default: `false`.
857    #[must_use]
858    fn supports_distributed_collectives(&self) -> bool {
859        false
860    }
861
862    /// Maximum supported workgroup size per axis `[x, y, z]`.
863    ///
864    /// Default: `[1, 1, 1]` (scalar dispatch  -  a backend that has not
865    /// reported a real limit cannot be trusted to execute parallel
866    /// workgroups).
867    #[must_use]
868    fn max_workgroup_size(&self) -> [u32; 3] {
869        [1, 1, 1]
870    }
871
872    /// Maximum number of compute workgroups the backend can launch in one
873    /// dispatch dimension.
874    ///
875    /// Default: `1`, which is safe for scalar/reference backends but must be
876    /// overridden by real GPU backends so schedulers do not under-launch.
877    #[must_use]
878    fn max_compute_workgroups_per_dimension(&self) -> u32 {
879        1
880    }
881
882    /// Maximum total invocations allowed in a single workgroup.
883    ///
884    /// Default derives from [`max_workgroup_size`](Self::max_workgroup_size)
885    /// and fails loudly if a backend reports an unrepresentable product.
886    #[must_use]
887    fn max_compute_invocations_per_workgroup(&self) -> u32 {
888        let [x, y, z] = self.max_workgroup_size();
889        let invocations = u128::from(x) * u128::from(y) * u128::from(z);
890        u32::try_from(invocations).unwrap_or(u32::MAX)
891    }
892
893    /// Native subgroup size for the backing device when the backend
894    /// knows it. Returning
895    /// `None` tells the dispatch planner the backend can't report a
896    /// subgroup width  -  the planner falls back to `max_workgroup_size`
897    /// for its sizing heuristic.
898    ///
899    /// I.6  -  adaptive workgroup sizing reads this capability to pick
900    /// a workgroup multiple of the subgroup so threads don't straddle
901    /// subgroups. Typical devices expose 16, 32, or 64 lanes.
902    #[must_use]
903    fn subgroup_size(&self) -> Option<u32> {
904        None
905    }
906
907    /// Maximum size in bytes of a single storage buffer the backend
908    /// accepts. `0` means the backend has not reported a limit, not
909    /// "unlimited".
910    ///
911    /// Default: `0`.
912    #[must_use]
913    fn max_storage_buffer_bytes(&self) -> u64 {
914        0
915    }
916
917    /// Unified backend-neutral device profile.
918    ///
919    /// Shared planner code should prefer this single profile over reading
920    /// individual capability methods one by one. Concrete backends may
921    /// override it when they can report richer device facts such as shared
922    /// memory size or native lowering-strategy features.
923    #[must_use]
924    fn device_profile(&self) -> crate::DeviceProfile {
925        let max_workgroup_size = self.max_workgroup_size();
926        crate::DeviceProfile {
927            backend: self.id(),
928            supports_subgroup_ops: self.supports_subgroup_ops(),
929            supports_indirect_dispatch: self.supports_indirect_dispatch(),
930            supports_distributed_collectives: self.supports_distributed_collectives(),
931            supports_specialization_constants: false,
932            supports_f16: self.supports_f16(),
933            supports_bf16: self.supports_bf16(),
934            supports_trap_propagation: false,
935            supports_tensor_cores: self.supports_tensor_cores(),
936            has_mul_high: false,
937            has_dual_issue_fp32_int32: false,
938            has_subgroup_shuffle: self.supports_subgroup_ops(),
939            has_shared_memory: false,
940            max_native_int_width: 32,
941            max_workgroup_size,
942            max_invocations_per_workgroup: self.max_compute_invocations_per_workgroup(),
943            max_shared_memory_bytes: 0,
944            max_storage_buffer_binding_size: self.max_storage_buffer_bytes(),
945            subgroup_size: self.subgroup_size().unwrap_or(0),
946            compute_units: 0,
947            regs_per_thread_max: 0,
948            l1_cache_bytes: 0,
949            l2_cache_bytes: 0,
950            mem_bw_gbps: 0,
951            timing_quality: crate::DeviceTimingQuality::HostOnly,
952            supports_device_timestamps: false,
953            supports_hardware_counters: false,
954            ideal_unroll_depth: 0,
955            ideal_vector_pack_bits: 0,
956            ideal_workgroup_tile: [0, 0, 0],
957            shared_memory_bank_count: 0,
958            shared_memory_bank_width_bytes: 0,
959        }
960    }
961
962    // ---------------------------------------------------------------
963    // Lifecycle hooks (defaulted, override as needed).
964    //
965    // These let a backend warm caches, flush pending work, recover from
966    // device loss, or tear down cleanly. Every hook defaults to a
967    // no-op-or-structured-error, so existing impls do not have to add
968    // any code.
969    // ---------------------------------------------------------------
970
971    /// Pre-dispatch warmup. Called before the first dispatch on a new
972    /// program so the backend can warm caches, compile ahead-of-time, or
973    /// acquire a device handle without paying that cost on the hot path.
974    ///
975    /// Default: no-op `Ok(())`.
976    ///
977    /// # Errors
978    ///
979    /// Returns [`BackendError`] if warmup cannot complete.
980    fn prepare(&self) -> Result<(), BackendError> {
981        Ok(())
982    }
983
984    /// Flush any queued work to the device and wait for it to complete.
985    ///
986    /// Useful before tearing down a context or before reading back data
987    /// that was produced by the last asynchronous dispatch.
988    ///
989    /// Default: no-op `Ok(())`  -  backends that do not queue work
990    /// implicitly satisfy flush.
991    ///
992    /// # Errors
993    ///
994    /// Returns [`BackendError`] on device failure.
995    fn flush(&self) -> Result<(), BackendError> {
996        Ok(())
997    }
998
999    /// Release device resources held by this backend. After `shutdown`
1000    /// returns the backend is in an unspecified state and may not be
1001    /// used for further dispatches.
1002    ///
1003    /// Default: no-op `Ok(())`.
1004    ///
1005    /// # Errors
1006    ///
1007    /// Returns [`BackendError`] on device failure during teardown.
1008    fn shutdown(&self) -> Result<(), BackendError> {
1009        Ok(())
1010    }
1011
1012    /// Probe whether the underlying device has been lost since the last
1013    /// successful dispatch.
1014    ///
1015    /// Default: `false` (assume healthy  -  backends that have no
1016    /// device-loss story do not need to probe).
1017    #[must_use]
1018    fn device_lost(&self) -> bool {
1019        false
1020    }
1021
1022    /// Attempt to recover from device loss by reacquiring the underlying
1023    /// device and invalidating pipeline caches.
1024    ///
1025    /// Default: returns an `UnsupportedFeature` error  -  recovery must be
1026    /// opt-in, because a backend that silently re-acquires without
1027    /// notifying the caller is a correctness hazard.
1028    ///
1029    /// # Errors
1030    ///
1031    /// Returns [`BackendError::UnsupportedFeature`] by default. Backends
1032    /// that implement recovery return any error encountered during
1033    /// re-acquisition.
1034    fn try_recover(&self) -> Result<(), BackendError> {
1035        Err(BackendError::UnsupportedFeature {
1036            name: "device recovery".to_string(),
1037            backend: self.id().to_string(),
1038        })
1039    }
1040
1041    /// Allocate a backend-owned device buffer of `byte_len` bytes.
1042    ///
1043    /// The returned [`DeviceBuffer`] handle is only meaningful to the
1044    /// backend that produced it. Backends that have not opted in return
1045    /// [`BackendError::UnsupportedFeature`] with the
1046    /// `DEVICE_BUFFER_FEATURE` name; production callers that require
1047    /// resident-buffer performance must treat that as a hard capability
1048    /// failure rather than silently routing through host `Vec<u8>`
1049    /// dispatch. Real device backends (cuda/wgpu/spirv) override this to
1050    /// wrap their concrete handle (for example, a vendor device allocation,
1051    /// vulkan buffer) in a `DeviceBuffer` impl.
1052    ///
1053    /// See `crate::backend::device_buffer` for the substrate.
1054    ///
1055    /// # Errors
1056    ///
1057    /// Returns [`BackendError::UnsupportedFeature`] when the backend
1058    /// has not yet implemented persistent device-buffer allocation.
1059    fn allocate_device_buffer(
1060        &self,
1061        _byte_len: usize,
1062    ) -> Result<Box<dyn DeviceBuffer>, BackendError> {
1063        Err(unsupported_device_buffer(self.id()))
1064    }
1065
1066    /// Upload host bytes into a previously-allocated device buffer.
1067    ///
1068    /// # Errors
1069    ///
1070    /// Returns [`BackendError`] when the buffer was not allocated by this
1071    /// backend, the byte length does not match the allocation, or the
1072    /// backend has not opted in to device-buffer dispatch.
1073    fn upload_device_buffer(
1074        &self,
1075        _buffer: &mut dyn DeviceBuffer,
1076        _bytes: &[u8],
1077    ) -> Result<(), BackendError> {
1078        Err(unsupported_device_buffer(self.id()))
1079    }
1080
1081    /// Download bytes from a device buffer back to a host `Vec<u8>`.
1082    ///
1083    /// # Errors
1084    ///
1085    /// Returns [`BackendError`] when the buffer was not allocated by this
1086    /// backend or the backend has not opted in to device-buffer dispatch.
1087    fn download_device_buffer(&self, _buffer: &dyn DeviceBuffer) -> Result<Vec<u8>, BackendError> {
1088        Err(unsupported_device_buffer(self.id()))
1089    }
1090
1091    /// Free a device buffer previously returned by
1092    /// [`Self::allocate_device_buffer`]. Explicit-free is required
1093    /// because the substrate does not assume reference-counted backend
1094    /// handles; consumers are responsible for calling this when done.
1095    ///
1096    /// # Errors
1097    ///
1098    /// Returns [`BackendError`] when the buffer was not allocated by this
1099    /// backend or the underlying free fails.
1100    fn free_device_buffer(&self, _buffer: Box<dyn DeviceBuffer>) -> Result<(), BackendError> {
1101        Err(unsupported_device_buffer(self.id()))
1102    }
1103
1104    /// Dispatch a Program with backend-owned device buffers as inputs and
1105    /// outputs.
1106    ///
1107    /// Backends that have implemented [`Self::allocate_device_buffer`]
1108    /// override this method to bind their concrete buffer type without
1109    /// the host upload/download round trip the legacy `dispatch` API
1110    /// requires. Backends that have not opted in return
1111    /// [`BackendError::UnsupportedFeature`]. Callers that choose this
1112    /// API are asking for resident-buffer execution; falling back to
1113    /// host-buffer dispatch would hide the copy cost and violate the
1114    /// performance contract.
1115    ///
1116    /// # Errors
1117    ///
1118    /// Returns [`BackendError::UnsupportedFeature`] by default. Real
1119    /// implementations return any error encountered during dispatch.
1120    fn dispatch_with_device_buffers(
1121        &self,
1122        _program: &Program,
1123        _inputs: &[&dyn DeviceBuffer],
1124        _outputs: &mut [&mut dyn DeviceBuffer],
1125        _config: &DispatchConfig,
1126    ) -> Result<(), BackendError> {
1127        Err(unsupported_device_buffer(self.id()))
1128    }
1129}
1130
1131fn elapsed_resident_sequence_wall_ns(started: std::time::Instant) -> Result<u64, BackendError> {
1132    u64::try_from(started.elapsed().as_nanos()).map_err(|error| BackendError::InvalidProgram {
1133        fix: format!(
1134            "Fix: resident sequence wall timing cannot fit u64 nanoseconds: {error}. Split telemetry windows or report per-step timing."
1135        ),
1136    })
1137}
1138
1139fn sum_optional_timing(
1140    accumulator: Option<u64>,
1141    next: Option<u64>,
1142    field: &'static str,
1143) -> Result<Option<u64>, BackendError> {
1144    match (accumulator, next) {
1145        (Some(left), Some(right)) => Ok(Some(left.checked_add(right).ok_or_else(|| {
1146            BackendError::InvalidProgram {
1147                fix: format!(
1148                    "Fix: resident sequence {field} overflowed u64 nanoseconds. Split telemetry windows or report per-step timing instead of silently clamping."
1149                ),
1150            }
1151        })?)),
1152        _ => Ok(None),
1153    }
1154}
1155
1156#[cfg(test)]
1157
1158mod tests {
1159    use super::*;
1160    use std::sync::atomic::{AtomicUsize, Ordering};
1161
1162    struct TelemetryBackend;
1163
1164    impl private::Sealed for TelemetryBackend {}
1165
1166    impl VyreBackend for TelemetryBackend {
1167        fn id(&self) -> &'static str {
1168            "telemetry-test"
1169        }
1170
1171        fn dispatch(
1172            &self,
1173            _program: &Program,
1174            _inputs: &[Vec<u8>],
1175            _config: &DispatchConfig,
1176        ) -> Result<Vec<Vec<u8>>, BackendError> {
1177            Ok(vec![vec![1, 2], vec![3, 4]])
1178        }
1179    }
1180
1181    struct SequenceTimingBackend {
1182        dispatches: AtomicUsize,
1183    }
1184
1185    impl private::Sealed for SequenceTimingBackend {}
1186
1187    impl VyreBackend for SequenceTimingBackend {
1188        fn id(&self) -> &'static str {
1189            "sequence-timing-test"
1190        }
1191
1192        fn dispatch(
1193            &self,
1194            _program: &Program,
1195            _inputs: &[Vec<u8>],
1196            _config: &DispatchConfig,
1197        ) -> Result<Vec<Vec<u8>>, BackendError> {
1198            Ok(Vec::new())
1199        }
1200
1201        fn dispatch_resident_timed(
1202            &self,
1203            _program: &Program,
1204            _resources: &[Resource],
1205            config: &DispatchConfig,
1206        ) -> Result<TimedDispatchResult, BackendError> {
1207            let index = self.dispatches.fetch_add(1, Ordering::SeqCst) as u64;
1208            assert_eq!(
1209                config.grid_override,
1210                Some([index as u32 + 1, 1, 1]),
1211                "Fix: default resident sequence timing must preserve each step's grid override."
1212            );
1213            Ok(TimedDispatchResult {
1214                outputs: Vec::new(),
1215                wall_ns: 10 + index,
1216                device_ns: Some(7 + index),
1217                enqueue_ns: Some(3 + index),
1218                wait_ns: Some(4 + index),
1219            })
1220        }
1221
1222        fn download_resident_ranges_into(
1223            &self,
1224            ranges: &[(&Resource, usize, usize)],
1225            outputs: &mut [&mut Vec<u8>],
1226        ) -> Result<(), BackendError> {
1227            assert_eq!(ranges.len(), outputs.len());
1228            for ((resource, offset, len), output) in ranges.iter().zip(outputs.iter_mut()) {
1229                let Resource::Resident(id) = resource else {
1230                    panic!("Fix: default timed resident sequence test expects resident resources.");
1231                };
1232                output.clear();
1233                output.extend_from_slice(&id.to_le_bytes());
1234                output.extend_from_slice(&(*offset as u64).to_le_bytes());
1235                output.extend_from_slice(&(*len as u64).to_le_bytes());
1236            }
1237            Ok(())
1238        }
1239    }
1240
1241    #[test]
1242    fn default_borrowed_into_dispatch_records_runtime_telemetry() {
1243        let _guard = crate::observability::audit_events_test_lock();
1244        let before = crate::observability::snapshot_dispatch_telemetry();
1245        let backend = TelemetryBackend;
1246        let mut outputs = vec![Vec::with_capacity(4), Vec::with_capacity(1)];
1247
1248        backend
1249            .dispatch_borrowed_into(
1250                &Program::empty(),
1251                &[&[9, 8, 7]],
1252                &DispatchConfig::default(),
1253                &mut outputs,
1254            )
1255            .expect("Fix: default borrowed-into dispatch must succeed");
1256
1257        let telemetry = crate::observability::snapshot_dispatch_telemetry();
1258        assert!(telemetry.launches >= before.launches + 1);
1259        assert!(telemetry.input_bytes >= before.input_bytes + 3);
1260        assert!(telemetry.output_bytes >= before.output_bytes + 4);
1261        assert!(telemetry.output_slots >= before.output_slots + 2);
1262        assert!(telemetry.output_slots_reused >= before.output_slots_reused + 1);
1263        assert!(telemetry.output_slots_moved >= before.output_slots_moved + 1);
1264        assert!(telemetry.output_slots_appended >= before.output_slots_appended);
1265    }
1266
1267    #[test]
1268    fn default_resident_sequence_timing_sums_step_device_times_and_reads_ranges() {
1269        let backend = SequenceTimingBackend {
1270            dispatches: AtomicUsize::new(0),
1271        };
1272        let program = Program::empty();
1273        let first_resources = [Resource::Resident(11)];
1274        let second_resources = [Resource::Resident(22)];
1275        let steps = [
1276            ResidentDispatchStep {
1277                program: &program,
1278                resources: &first_resources,
1279                grid_override: Some([1, 1, 1]),
1280                workgroup_override: None,
1281            },
1282            ResidentDispatchStep {
1283                program: &program,
1284                resources: &second_resources,
1285                grid_override: Some([2, 1, 1]),
1286                workgroup_override: None,
1287            },
1288        ];
1289        let read_resource = Resource::Resident(33);
1290        let reads = [ResidentReadRange {
1291            resource: &read_resource,
1292            byte_offset: 4,
1293            byte_len: 8,
1294        }];
1295        let mut output = Vec::new();
1296
1297        let timing = backend
1298            .dispatch_resident_sequence_read_ranges_timed_into(&steps, &reads, &mut [&mut output])
1299            .expect("Fix: default timed resident sequence must execute and read ranges.");
1300
1301        assert_eq!(backend.dispatches.load(Ordering::SeqCst), 2);
1302        assert_eq!(timing.device_ns, Some(15));
1303        assert_eq!(timing.enqueue_ns, Some(7));
1304        assert_eq!(timing.wait_ns, Some(9));
1305        assert!(timing.wall_ns > 0);
1306        assert_eq!(output.len(), 24);
1307        assert_eq!(u64::from_le_bytes(output[0..8].try_into().unwrap()), 33);
1308        assert_eq!(u64::from_le_bytes(output[8..16].try_into().unwrap()), 4);
1309        assert_eq!(u64::from_le_bytes(output[16..24].try_into().unwrap()), 8);
1310    }
1311
1312    #[test]
1313    fn default_dispatch_paths_use_shared_fallible_staging_and_checked_timing() {
1314        let backend_source = include_str!("vyre_backend.rs");
1315        let compiled_source = include_str!("compiled_pipeline.rs");
1316        let module_source = include_str!("../backend.rs");
1317
1318        assert!(
1319            module_source.contains("fn clone_borrowed_inputs_for_dispatch")
1320                && module_source.contains("fn reserve_batch_output_slots")
1321                && module_source.contains("fn checked_elapsed_wall_ns"),
1322            "Fix: backend defaults must share one fallible staging and checked timing contract."
1323        );
1324        for source in [backend_source, compiled_source] {
1325            let production = source
1326                .split("#[cfg(test)]")
1327                .next()
1328                .expect("Fix: backend source must contain production section before tests");
1329            assert!(
1330                production.contains("clone_borrowed_inputs_for_dispatch")
1331                    && production.contains("checked_elapsed_wall_ns")
1332                    && !production.contains(".as_nanos() as u64")
1333                    && !production.contains("inputs.iter().map(|input| (*input).to_vec()).collect()"),
1334                "Fix: inherited backend dispatch defaults must avoid infallible borrowed-input collection and lossy wall timing."
1335            );
1336        }
1337        let compiled_production = compiled_source
1338            .split("#[cfg(test)]")
1339            .next()
1340            .expect("Fix: compiled pipeline source must contain production section before tests");
1341        assert!(
1342            compiled_production.contains("reserved_batch_output_slots")
1343                && !compiled_production.contains("Vec::with_capacity(batches.len())"),
1344            "Fix: compiled-pipeline batch defaults must construct output slots through shared fallible staging."
1345        );
1346    }
1347}