vyre_driver/backend/vyre_backend.rs
1//! The frozen `VyreBackend` contract.
2
3use std::sync::Arc;
4
5use smallvec::SmallVec;
6use vyre_foundation::ir::Program;
7
8use crate::backend::{
9 device_buffer::unsupported_device_buffer, private, BackendError, CompiledPipeline,
10 DeviceBuffer, DispatchConfig, OutputBuffers, PendingDispatch, Resource, TimedDispatchResult,
11};
12
13/// One backend-resident program dispatch in an ordered sequence.
14pub struct ResidentDispatchStep<'a> {
15 /// Program to dispatch.
16 pub program: &'a Program,
17 /// Resident resources in binding order.
18 pub resources: &'a [Resource],
19 /// Optional CUDA/grid-style launch override.
20 pub grid_override: Option<[u32; 3]>,
21 /// Optional workgroup override. MUST be carried alongside `grid_override`:
22 /// a caller that sizes its grid for a specific workgroup (grid =
23 /// ceil(work / workgroup)) will under-cover the work if the step falls back
24 /// to a different default workgroup. `None` keeps the backend's resolved
25 /// default (correct only when `grid_override` is also `None`).
26 pub workgroup_override: Option<[u32; 3]>,
27}
28
29/// One compact byte range to read from a backend-resident resource.
30pub struct ResidentReadRange<'a> {
31 /// Resident resource to read.
32 pub resource: &'a Resource,
33 /// Byte offset inside the resident resource.
34 pub byte_offset: usize,
35 /// Number of bytes to read.
36 pub byte_len: usize,
37}
38
39/// Timing captured for an ordered resident dispatch sequence.
40#[derive(Clone, Debug, Default, Eq, PartialEq)]
41pub struct ResidentSequenceTiming {
42 /// Host-observed sequence duration including requested readbacks.
43 pub wall_ns: u64,
44 /// Device-observed elapsed dispatch time when the backend exposes timers.
45 pub device_ns: Option<u64>,
46 /// Host time spent enqueueing backend work before waiting.
47 pub enqueue_ns: Option<u64>,
48 /// Host time spent waiting for completion and collecting outputs.
49 pub wait_ns: Option<u64>,
50}
51
52/// The frozen contract between vyre and every execution backend.
53///
54/// A backend is a pure function from a validated `Program` and input buffers
55/// to output buffers. Implementations must be `Send + Sync`, deterministic
56/// for identical inputs, and byte-identical to the CPU reference on success.
57/// This trait is the keystone of the vyre abstraction thesis: frontends do
58/// not know which backend runs their IR, and backends do not know which
59/// frontend produced it.
60///
61/// # Examples
62///
63pub trait VyreBackend: private::Sealed + Send + Sync {
64 /// Stable backend identifier used for logging, certificates, and adapter selection.
65 ///
66 /// The identifier must be unique among all backends linked into the
67 /// current process. Conformance reports include this string so that
68 /// consumers know exactly which implementation was certified.
69 fn id(&self) -> &'static str;
70
71 /// Backend implementation version string used for certificates and
72 /// regression tracking.
73 ///
74 /// The default returns `"unspecified"`. Concrete backends should
75 /// override this with their crate version (e.g. `"0.4.0"`) so that
76 /// certificates can detect backend upgrades that may require re-cert.
77 fn version(&self) -> &'static str {
78 "unspecified"
79 }
80
81 /// Operation ids this backend can execute without further lowering.
82 fn supported_ops(&self) -> &std::collections::HashSet<vyre_foundation::ir::OpId> {
83 use crate::backend::validation::default_supported_ops;
84 default_supported_ops()
85 }
86
87 // Raw backend shader text is a concrete-driver implementation
88 // detail, not part of the substrate-neutral `VyreBackend`
89 // contract.
90
91 /// Executes the program with the given input buffers and returns the output buffers.
92 ///
93 /// On success the returned bytes must match the pure-Rust reference
94 /// implementation bit-for-bit. On failure the backend must return a
95 /// [`BackendError`] whose message contains an actionable `Fix: ` hint.
96 ///
97 /// # Examples
98 ///
99 /// ```no_run
100 /// use vyre::{Program, VyreBackend, DispatchConfig};
101 ///
102 /// # fn example(backend: &dyn VyreBackend, program: &Program) -> Result<Vec<Vec<u8>>, vyre::BackendError> {
103 /// let inputs = vec![vec![1u8, 2, 3]];
104 /// let config = DispatchConfig::default();
105 /// backend.dispatch(program, &inputs, &config)
106 /// # }
107 /// ```
108 ///
109 /// # Errors
110 ///
111 /// Returns [`BackendError`] when the backend cannot complete dispatch.
112 /// The error message always includes a `Fix: ` remediation section.
113 fn dispatch(
114 &self,
115 program: &Program,
116 inputs: &[Vec<u8>],
117 config: &DispatchConfig,
118 ) -> Result<Vec<Vec<u8>>, BackendError>;
119
120 /// Executes the program with borrowed input buffers.
121 ///
122 /// Backends may override this method to avoid staging borrowed bytes into
123 /// owned `Vec<u8>` buffers. The default is non-breaking: it performs one
124 /// owned vector allocation for the call and delegates to
125 /// [`VyreBackend::dispatch`].
126 ///
127 /// # Errors
128 ///
129 /// Returns [`BackendError`] when the backend cannot complete dispatch.
130 fn dispatch_borrowed(
131 &self,
132 program: &Program,
133 inputs: &[&[u8]],
134 config: &DispatchConfig,
135 ) -> Result<Vec<Vec<u8>>, BackendError> {
136 let owned =
137 crate::backend::clone_borrowed_inputs_for_dispatch(inputs, "backend input staging")?;
138 let outputs = self.dispatch(program, &owned, config)?;
139 crate::observability::record_dispatch_io(inputs, &outputs);
140 Ok(outputs)
141 }
142
143 /// Executes a borrowed-input dispatch and returns backend-owned timing.
144 ///
145 /// The default records host wall time and delegates to
146 /// [`VyreBackend::dispatch_borrowed`]. Device-specific backends override
147 /// this only inside their driver crates so benchmark crates never import
148 /// vendor APIs directly.
149 ///
150 /// # Errors
151 ///
152 /// Returns [`BackendError`] when the backend cannot complete dispatch.
153 fn dispatch_borrowed_timed(
154 &self,
155 program: &Program,
156 inputs: &[&[u8]],
157 config: &DispatchConfig,
158 ) -> Result<TimedDispatchResult, BackendError> {
159 let started = std::time::Instant::now();
160 let outputs = self.dispatch_borrowed(program, inputs, config)?;
161 Ok(TimedDispatchResult {
162 outputs,
163 wall_ns: crate::backend::checked_elapsed_wall_ns(started, "backend borrowed dispatch")?,
164 device_ns: None,
165 enqueue_ns: None,
166 wait_ns: None,
167 })
168 }
169
170 /// Executes the program with borrowed input buffers and writes outputs into
171 /// caller-owned storage.
172 ///
173 /// Backends may override this method to reuse output buffers across
174 /// dispatches. The default preserves the existing dispatch contract and
175 /// copies returned bytes into existing output slots where possible.
176 ///
177 /// # Errors
178 ///
179 /// Returns [`BackendError`] when the backend cannot complete dispatch.
180 fn dispatch_borrowed_into(
181 &self,
182 program: &Program,
183 inputs: &[&[u8]],
184 config: &DispatchConfig,
185 outputs: &mut OutputBuffers,
186 ) -> Result<(), BackendError> {
187 let result = self.dispatch_borrowed(program, inputs, config)?;
188 let stats = crate::backend::dispatch_result::replace_output_buffers_preserving_slots_with_memory_stats(
189 result,
190 outputs,
191 );
192 crate::observability::record_output_replacement_stats(stats);
193 Ok(())
194 }
195
196 /// Allocate a backend-resident buffer and return a stable resource handle.
197 ///
198 /// Backends that support resident resources override this method so callers
199 /// can keep hot inputs on the device without importing a concrete driver
200 /// crate. The returned [`Resource`] is only meaningful to the backend that
201 /// produced it.
202 ///
203 /// # Errors
204 ///
205 /// Returns [`BackendError`] when the backend cannot allocate a resident
206 /// resource of the requested size.
207 fn allocate_resident(&self, _byte_len: usize) -> Result<Resource, BackendError> {
208 Err(BackendError::UnsupportedFeature {
209 name: "resident buffer allocation".to_string(),
210 backend: self.id().to_string(),
211 })
212 }
213
214 /// Upload bytes into a backend-resident resource.
215 ///
216 /// # Errors
217 ///
218 /// Returns [`BackendError`] when the resource is not owned by this backend
219 /// or the byte length does not match the resident allocation.
220 fn upload_resident(&self, _resource: &Resource, _bytes: &[u8]) -> Result<(), BackendError> {
221 Err(BackendError::UnsupportedFeature {
222 name: "resident buffer upload".to_string(),
223 backend: self.id().to_string(),
224 })
225 }
226
227 /// Upload several backend-resident resources as one logical staging
228 /// operation.
229 ///
230 /// Backends that support resident graph/dataflow hot paths should override
231 /// this with a native batched transfer. The default fails loudly instead
232 /// of looping over [`VyreBackend::upload_resident`], because a hidden
233 /// per-buffer synchronization loop destroys the performance contract.
234 ///
235 /// # Errors
236 ///
237 /// Returns [`BackendError`] when the backend cannot batch resident uploads
238 /// or when any resource/byte length is invalid.
239 fn upload_resident_many(&self, _uploads: &[(&Resource, &[u8])]) -> Result<(), BackendError> {
240 Err(BackendError::UnsupportedFeature {
241 name: "resident buffer batch upload".to_string(),
242 backend: self.id().to_string(),
243 })
244 }
245
246 /// Upload bytes into a subrange of a backend-resident resource.
247 ///
248 /// This is the hot-loop path for reusable resident slots whose capacity is
249 /// larger than the current logical payload. Backends must not require the
250 /// upload length to equal the allocation length.
251 ///
252 /// # Errors
253 ///
254 /// Returns [`BackendError`] when ranged upload is unsupported, the resource
255 /// is not owned by this backend, or the destination range is out of bounds.
256 fn upload_resident_at(
257 &self,
258 _resource: &Resource,
259 _dst_offset_bytes: usize,
260 _bytes: &[u8],
261 ) -> Result<(), BackendError> {
262 Err(BackendError::UnsupportedFeature {
263 name: "resident buffer ranged upload".to_string(),
264 backend: self.id().to_string(),
265 })
266 }
267
268 /// Upload several resident subranges as one logical staging operation.
269 ///
270 /// The default fails loudly instead of looping over
271 /// [`VyreBackend::upload_resident_at`], because hidden per-range
272 /// synchronization breaks resident hot-loop performance.
273 ///
274 /// # Errors
275 ///
276 /// Returns [`BackendError`] when ranged batch upload is unsupported or when
277 /// any resource/range is invalid.
278 fn upload_resident_at_many(
279 &self,
280 _uploads: &[(&Resource, usize, &[u8])],
281 ) -> Result<(), BackendError> {
282 Err(BackendError::UnsupportedFeature {
283 name: "resident buffer ranged batch upload".to_string(),
284 backend: self.id().to_string(),
285 })
286 }
287
288 /// Download a backend-resident resource into a new host buffer.
289 ///
290 /// Prefer [`VyreBackend::download_resident_into`] in hot loops so repeated
291 /// validation does not allocate a fresh `Vec` for every readback.
292 ///
293 /// # Errors
294 ///
295 /// Returns [`BackendError`] when the backend cannot download resident
296 /// resources or when `resource` is not owned by this backend.
297 fn download_resident(&self, resource: &Resource) -> Result<Vec<u8>, BackendError> {
298 let mut bytes = Vec::new();
299 self.download_resident_into(resource, &mut bytes)?;
300 Ok(bytes)
301 }
302
303 /// Download a backend-resident resource into caller-owned storage.
304 ///
305 /// Implementations must clear and reuse `out`; hidden compatibility
306 /// allocation defeats resident hot-loop validation.
307 ///
308 /// # Errors
309 ///
310 /// Returns [`BackendError`] when resident download is unsupported or when
311 /// `resource` is not owned by this backend.
312 fn download_resident_into(
313 &self,
314 _resource: &Resource,
315 _out: &mut Vec<u8>,
316 ) -> Result<(), BackendError> {
317 Err(BackendError::UnsupportedFeature {
318 name: "resident buffer download".to_string(),
319 backend: self.id().to_string(),
320 })
321 }
322
323 /// Download a byte range from a backend-resident resource into a new host
324 /// buffer.
325 ///
326 /// Prefer [`VyreBackend::download_resident_range_into`] in hot loops.
327 ///
328 /// # Errors
329 ///
330 /// Returns [`BackendError`] when resident ranged download is unsupported,
331 /// the range is invalid, or `resource` is not owned by this backend.
332 fn download_resident_range(
333 &self,
334 resource: &Resource,
335 byte_offset: usize,
336 byte_len: usize,
337 ) -> Result<Vec<u8>, BackendError> {
338 let mut bytes = Vec::new();
339 bytes.try_reserve_exact(byte_len).map_err(|error| {
340 BackendError::InvalidProgram {
341 fix: format!(
342 "Fix: resident ranged download could not reserve {byte_len} output byte(s): {error}. Split the readback range before dispatch."
343 ),
344 }
345 })?;
346 self.download_resident_range_into(resource, byte_offset, byte_len, &mut bytes)?;
347 Ok(bytes)
348 }
349
350 /// Download a byte range from a backend-resident resource into
351 /// caller-owned storage.
352 ///
353 /// # Errors
354 ///
355 /// Returns [`BackendError`] when resident ranged download is unsupported,
356 /// the range is invalid, or `resource` is not owned by this backend.
357 fn download_resident_range_into(
358 &self,
359 _resource: &Resource,
360 _byte_offset: usize,
361 _byte_len: usize,
362 _out: &mut Vec<u8>,
363 ) -> Result<(), BackendError> {
364 Err(BackendError::UnsupportedFeature {
365 name: "resident buffer ranged download".to_string(),
366 backend: self.id().to_string(),
367 })
368 }
369
370 /// Download several byte ranges from backend-resident resources into
371 /// caller-owned storage as one logical readback operation.
372 ///
373 /// Backends with a real multi-readback path should override this to issue
374 /// all copies behind one backend synchronization boundary.
375 ///
376 /// # Errors
377 ///
378 /// Returns [`BackendError`] when counts do not match, any range is invalid,
379 /// or resident ranged download is unsupported.
380 fn download_resident_ranges_into(
381 &self,
382 ranges: &[(&Resource, usize, usize)],
383 outputs: &mut [&mut Vec<u8>],
384 ) -> Result<(), BackendError> {
385 if ranges.len() != outputs.len() {
386 return Err(BackendError::InvalidProgram {
387 fix: format!(
388 "Fix: resident ranged batch download expected matching range/output counts but got {} range(s) and {} output(s).",
389 ranges.len(),
390 outputs.len()
391 ),
392 });
393 }
394 for ((resource, byte_offset, byte_len), output) in ranges.iter().zip(outputs.iter_mut()) {
395 self.download_resident_range_into(resource, *byte_offset, *byte_len, output)?;
396 }
397 Ok(())
398 }
399
400 /// Free a backend-resident resource previously returned by
401 /// [`VyreBackend::allocate_resident`].
402 ///
403 /// # Errors
404 ///
405 /// Returns [`BackendError`] when the resource is unknown or still in use.
406 fn free_resident(&self, _resource: Resource) -> Result<(), BackendError> {
407 Err(BackendError::UnsupportedFeature {
408 name: "resident buffer free".to_string(),
409 backend: self.id().to_string(),
410 })
411 }
412
413 /// Dispatch using backend-resident resources and return backend-owned
414 /// timing.
415 ///
416 /// # Errors
417 ///
418 /// Returns [`BackendError`] when the backend does not support resident
419 /// dispatch or any resource is invalid for the program.
420 fn dispatch_resident_timed(
421 &self,
422 _program: &Program,
423 _resources: &[Resource],
424 _config: &DispatchConfig,
425 ) -> Result<TimedDispatchResult, BackendError> {
426 Err(BackendError::UnsupportedFeature {
427 name: "resident timed dispatch".to_string(),
428 backend: self.id().to_string(),
429 })
430 }
431
432 /// Dispatch an ordered sequence of resident-buffer programs and read
433 /// selected resident byte ranges into caller-owned storage.
434 ///
435 /// The default preserves correctness by dispatching each step through
436 /// [`VyreBackend::dispatch_resident_timed`] and then calling
437 /// [`VyreBackend::download_resident_ranges_into`]. CUDA overrides this to
438 /// enqueue the whole dependent chain plus D2H readbacks on one stream and
439 /// pay one host synchronization point.
440 ///
441 /// # Errors
442 ///
443 /// Returns [`BackendError`] when any step fails, when readback ranges are
444 /// invalid, or when the backend cannot perform resident dispatch/readback.
445 fn dispatch_resident_sequence_read_ranges_into(
446 &self,
447 steps: &[ResidentDispatchStep<'_>],
448 read_ranges: &[ResidentReadRange<'_>],
449 outputs: &mut [&mut Vec<u8>],
450 ) -> Result<(), BackendError> {
451 for step in steps {
452 let mut config = DispatchConfig::default();
453 config.grid_override = step.grid_override;
454 self.dispatch_resident_timed(step.program, step.resources, &config)?;
455 }
456 let ranges = read_ranges
457 .iter()
458 .map(|range| (range.resource, range.byte_offset, range.byte_len))
459 .collect::<SmallVec<[_; 8]>>();
460 self.download_resident_ranges_into(&ranges, outputs)
461 }
462
463 /// Timed variant of
464 /// [`VyreBackend::dispatch_resident_sequence_read_ranges_into`].
465 ///
466 /// The default preserves correctness by summing each step's resident
467 /// dispatch timing and then downloading the requested ranges. Backends with
468 /// a fused resident sequence path should override this to keep their
469 /// optimized stream/queue behavior while exposing device timing.
470 ///
471 /// # Errors
472 ///
473 /// Returns [`BackendError`] when any step fails, when readback ranges are
474 /// invalid, or when resident sequence timing overflows.
475 fn dispatch_resident_sequence_read_ranges_timed_into(
476 &self,
477 steps: &[ResidentDispatchStep<'_>],
478 read_ranges: &[ResidentReadRange<'_>],
479 outputs: &mut [&mut Vec<u8>],
480 ) -> Result<ResidentSequenceTiming, BackendError> {
481 let started = std::time::Instant::now();
482 let mut device_ns = Some(0_u64);
483 let mut enqueue_ns = Some(0_u64);
484 let mut wait_ns = Some(0_u64);
485 for step in steps {
486 let mut config = DispatchConfig::default();
487 config.grid_override = step.grid_override;
488 let timed = self.dispatch_resident_timed(step.program, step.resources, &config)?;
489 device_ns = sum_optional_timing(device_ns, timed.device_ns, "device timing")?;
490 enqueue_ns = sum_optional_timing(enqueue_ns, timed.enqueue_ns, "enqueue timing")?;
491 wait_ns = sum_optional_timing(wait_ns, timed.wait_ns, "wait timing")?;
492 }
493 let ranges = read_ranges
494 .iter()
495 .map(|range| (range.resource, range.byte_offset, range.byte_len))
496 .collect::<SmallVec<[_; 8]>>();
497 self.download_resident_ranges_into(&ranges, outputs)?;
498 Ok(ResidentSequenceTiming {
499 wall_ns: elapsed_resident_sequence_wall_ns(started)?,
500 device_ns,
501 enqueue_ns,
502 wait_ns,
503 })
504 }
505
506 /// Dispatch a resident prefix, repeat a resident sub-sequence, and read
507 /// selected resident byte ranges into caller-owned storage.
508 ///
509 /// This is the fixed-point hot-path contract: dataflow clients can express
510 /// a repeated kernel group without allocating one host sequence entry per
511 /// iteration. Backends that understand repetition should override this to
512 /// keep launch preparation and parameter upload sublinear in
513 /// `repeat_count`.
514 ///
515 /// # Errors
516 ///
517 /// Returns [`BackendError`] when any step fails, when readback ranges are
518 /// invalid, or when the backend cannot perform resident dispatch/readback.
519 fn dispatch_resident_repeated_sequence_read_ranges_into(
520 &self,
521 prefix_steps: &[ResidentDispatchStep<'_>],
522 repeated_steps: &[ResidentDispatchStep<'_>],
523 repeat_count: u32,
524 read_ranges: &[ResidentReadRange<'_>],
525 outputs: &mut [&mut Vec<u8>],
526 ) -> Result<(), BackendError> {
527 for step in prefix_steps {
528 let mut config = DispatchConfig::default();
529 config.grid_override = step.grid_override;
530 self.dispatch_resident_timed(step.program, step.resources, &config)?;
531 }
532 for _ in 0..repeat_count {
533 for step in repeated_steps {
534 let mut config = DispatchConfig::default();
535 config.grid_override = step.grid_override;
536 self.dispatch_resident_timed(step.program, step.resources, &config)?;
537 }
538 }
539 let ranges = read_ranges
540 .iter()
541 .map(|range| (range.resource, range.byte_offset, range.byte_len))
542 .collect::<SmallVec<[_; 8]>>();
543 self.download_resident_ranges_into(&ranges, outputs)
544 }
545
546 /// Optional pre-compilation hook for the pipeline-mode API.
547 ///
548 /// Default returns `Ok(None)` - the framework wraps in a passthrough
549 /// pipeline whose `dispatch` calls back into [`VyreBackend::dispatch`]
550 /// every time. Backends that genuinely cache compiled state (compute
551 /// pipeline, bind-group layout, lowered shader text) override this and
552 /// return `Ok(Some(...))` so repeated dispatches skip the compilation
553 /// overhead.
554 ///
555 /// The returned pipeline MUST be bit-identical to repeated
556 /// `dispatch(program, inputs, config)` for the program it was compiled
557 /// from. The cache key is the backend's responsibility - the framework
558 /// does not deduplicate compile calls.
559 ///
560 /// Implementing this method is the P-6 contract from
561 /// `docs/audits/ROADMAP_PERFORMANCE.md`: "compile target-text + pipeline +
562 /// bind-group-layout once; dispatch repeatedly with different inputs."
563 ///
564 /// # Errors
565 ///
566 /// Returns [`BackendError`] when the backend cannot complete the
567 /// pre-compilation. Callers should treat this as fatal for the program
568 /// (the program will not dispatch successfully via any path).
569 fn compile_native(
570 &self,
571 _program: &Program,
572 _config: &DispatchConfig,
573 ) -> Result<Option<Arc<dyn CompiledPipeline>>, BackendError> {
574 Ok(None)
575 }
576
577 /// Optional pre-compilation hook for callers that already own a shared
578 /// program allocation.
579 ///
580 /// Backends that store the program inside the compiled pipeline should
581 /// override this method and keep the supplied [`Arc<Program>`] instead of
582 /// cloning the IR. The default preserves the older borrowed-program hook
583 /// for backends that only inspect the program while compiling.
584 ///
585 /// # Errors
586 ///
587 /// Returns [`BackendError`] when backend-native compilation fails.
588 fn compile_native_shared(
589 &self,
590 program: Arc<Program>,
591 config: &DispatchConfig,
592 ) -> Result<Option<Arc<dyn CompiledPipeline>>, BackendError> {
593 self.compile_native(&program, config)
594 }
595
596 /// Optional compiled-pipeline cache counters for compile telemetry.
597 ///
598 /// Return `None` unless the backend can report real cache hits and misses.
599 fn pipeline_cache_snapshot(&self) -> Option<crate::pipeline::PipelineCacheSnapshot> {
600 None
601 }
602
603 /// Optional backend-specific numeric telemetry for release evidence.
604 ///
605 /// Return an empty vector unless the backend can report real counters.
606 /// Metric names must be stable ASCII identifiers suitable for JSON and
607 /// Prometheus-style export.
608 fn backend_metric_snapshot(&self) -> Vec<(&'static str, u64)> {
609 Vec::new()
610 }
611
612 /// Non-blocking dispatch primitive.
613 ///
614 /// Returns a [`PendingDispatch`] handle immediately; the caller
615 /// polls via [`PendingDispatch::is_ready`] and consumes the result
616 /// via [`PendingDispatch::await_result`]. Backends that genuinely
617 /// pipeline dispatches override this so N concurrent dispatches
618 /// do not serialize on the host.
619 ///
620 /// Default: run the synchronous [`VyreBackend::dispatch`] path and
621 /// wrap the result in a trivially-ready handle. This keeps every
622 /// backend useful from the async API without forcing an async
623 /// rewrite.
624 ///
625 /// # Errors
626 ///
627 /// Returns [`BackendError`] if the dispatch cannot start. Errors
628 /// that surface only during GPU execution come back through
629 /// [`PendingDispatch::await_result`], not from this call.
630 fn dispatch_async(
631 &self,
632 program: &Program,
633 inputs: &[Vec<u8>],
634 config: &DispatchConfig,
635 ) -> Result<Box<dyn PendingDispatch>, BackendError> {
636 let outputs = self.dispatch(program, inputs, config)?;
637 Ok(Box::new(crate::backend::pending_dispatch::ReadyPending {
638 outputs,
639 }))
640 }
641
642 /// Non-blocking dispatch with borrowed input buffers.
643 ///
644 /// Backends that record GPU commands synchronously before returning can
645 /// override this to avoid cloning input buffers just to create a pending
646 /// handle. The returned [`PendingDispatch`] must not borrow from `inputs`.
647 ///
648 /// # Errors
649 ///
650 /// Returns [`BackendError`] if the dispatch cannot start.
651 fn dispatch_borrowed_async(
652 &self,
653 program: &Program,
654 inputs: &[&[u8]],
655 config: &DispatchConfig,
656 ) -> Result<Box<dyn PendingDispatch>, BackendError> {
657 let outputs = self.dispatch_borrowed(program, inputs, config)?;
658 Ok(Box::new(crate::backend::pending_dispatch::ReadyPending {
659 outputs,
660 }))
661 }
662
663 // ---------------------------------------------------------------
664 // Capability queries (all default to conservative "no" / minimal).
665 //
666 // These are the stable capability surface. Additional backends implement
667 // this trait by default-inheriting every capability below and OVERRIDING
668 // only the ones where they are more capable than the conservative floor.
669 // This means adding a backend is strictly additive - no existing
670 // backend impl has to change when a new capability query is added.
671 //
672 // Backends MUST report HONESTLY. Returning `true` from a capability
673 // query is a promise the lowering path emits the corresponding
674 // intrinsic and the adapter supports it. "Supported but broken" is a
675 // LAW 9 evasion (see AGENTS.md). If the feature bit is set on the
676 // device but the lowering emits a slower emulation sequence, the
677 // answer is `false` until the native lowering catches up.
678 // ---------------------------------------------------------------
679
680 /// Whether this backend's lowering path emits subgroup / wave
681 /// intrinsics AND the current adapter exposes them.
682 ///
683 /// Default: `false` (conservative - assumes no native subgroup lowering).
684 #[must_use]
685 fn supports_subgroup_ops(&self) -> bool {
686 false
687 }
688
689 /// Whether this backend lowers IEEE 754 binary16 (`DataType::F16`)
690 /// natively rather than emulating through `f32`.
691 ///
692 /// Default: `false`.
693 #[must_use]
694 fn supports_f16(&self) -> bool {
695 false
696 }
697
698 /// Whether this backend lowers bfloat16 (`DataType::BF16`) natively.
699 ///
700 /// Default: `false`.
701 #[must_use]
702 fn supports_bf16(&self) -> bool {
703 false
704 }
705
706 /// Whether this backend emits tensor-core / matrix-engine intrinsics
707 /// for supported tensor shapes.
708 ///
709 /// Default: `false`.
710 #[must_use]
711 fn supports_tensor_cores(&self) -> bool {
712 false
713 }
714
715 /// Whether this backend overlaps copies and compute via independent
716 /// queues or async engines.
717 ///
718 /// Default: `false` (host serializes copy ↔ compute).
719 #[must_use]
720 fn supports_async_compute(&self) -> bool {
721 false
722 }
723
724 /// Whether this backend supports indirect dispatch
725 /// (`Node::IndirectDispatch`).
726 ///
727 /// Default: `false`.
728 #[must_use]
729 fn supports_indirect_dispatch(&self) -> bool {
730 false
731 }
732
733 /// Whether this backend supports speculative dispatch - a fused
734 /// prefilter + confirmer kernel with commit-gated output and a
735 /// counter tail read back by the host.
736 ///
737 /// Default: `false`.
738 #[must_use]
739 fn supports_speculation(&self) -> bool {
740 false
741 }
742
743 /// Whether this backend supports device-side persistent-thread
744 /// dispatch (a long-running kernel that polls a work queue).
745 ///
746 /// Default: `false`.
747 #[must_use]
748 fn supports_persistent_thread_dispatch(&self) -> bool {
749 false
750 }
751
752 /// Whether this backend can satisfy `Node::Barrier { ordering:
753 /// MemoryOrdering::GridSync }` inside a single dispatch - i.e.
754 /// every thread in the entire grid waits at the barrier and
755 /// every prior write is globally visible afterwards. Backends
756 /// that lack a native grid barrier (workgroup-only fences) must
757 /// return `false`; registration-based dispatch may lower a
758 /// `GridSync` barrier to a host-orchestrated kernel split only
759 /// when [`VyreBackend::allows_host_grid_sync_split`] also returns
760 /// `true`.
761 ///
762 /// Backends with cooperative whole-grid launch support can return
763 /// `true`; backends limited to workgroup-local synchronization return
764 /// `false` until the target exposes a compatible grid-barrier primitive.
765 ///
766 /// Default: `false`.
767 #[must_use]
768 fn supports_grid_sync(&self) -> bool {
769 false
770 }
771
772 /// Whether a native cooperative grid-sync launch of `program` with these
773 /// `inputs` and `config` can be made fully resident on this device.
774 ///
775 /// [`VyreBackend::supports_grid_sync`] reports that native lowering is
776 /// *available*; this reports whether it *fits* for a specific dispatch. A
777 /// cooperative launch requires every block co-resident, so a grid whose
778 /// block count exceeds the device's cooperative residency cannot run
779 /// natively and must route to the resident-fixpoint or host-split path.
780 /// Orchestrators call this to choose the native route only when it fits,
781 /// avoiding a wasted allocate/upload that would otherwise end in
782 /// [`crate::ErrorCode::CooperativeResidencyExceeded`].
783 ///
784 /// Default: `Ok(false)` (no native cooperative launch). Backends that lower
785 /// grid sync override this with the real residency check. Returns `Ok(false)`
786 /// — not an error — when the program carries no grid-sync barrier, since
787 /// there is then nothing to launch cooperatively.
788 ///
789 /// # Errors
790 ///
791 /// Returns [`BackendError`] if the launch geometry cannot be computed for
792 /// the program/inputs (a structurally invalid dispatch).
793 fn cooperative_grid_sync_fits(
794 &self,
795 _program: &Program,
796 _inputs: &[&[u8]],
797 _config: &DispatchConfig,
798 ) -> Result<bool, BackendError> {
799 Ok(false)
800 }
801
802 /// Whether the shared registry wrapper may emulate whole-grid
803 /// synchronization for this backend by splitting one program into
804 /// multiple host-dispatched kernels.
805 ///
806 /// This exists separately from [`VyreBackend::supports_grid_sync`]
807 /// because a backend can intentionally reject hidden host
808 /// orchestration while native cooperative-grid lowering is absent.
809 /// CUDA uses that policy in the release path so missing native
810 /// grid-barrier lowering is surfaced as an unsupported feature
811 /// instead of silently becoming a slower multi-launch path.
812 ///
813 /// Default: `true` to preserve existing behavior for simple
814 /// backends that intentionally rely on shared split lowering.
815 #[must_use]
816 fn allows_host_grid_sync_split(&self) -> bool {
817 true
818 }
819
820 /// Whether this backend implements the resident half of the contract
821 /// (`allocate_resident` / `upload_resident` / `dispatch_resident_timed` /
822 /// `dispatch_resident_repeated_sequence_read_ranges_into` /
823 /// `download_resident_*` / `free_resident`) well enough to run a
824 /// device-resident dispatch sequence.
825 ///
826 /// Consumers use this to choose
827 /// [`crate::grid_sync::dispatch_resident_grid_sync_fixpoint_into`] (which
828 /// keeps live buffers device-resident across every grid-sync segment and
829 /// fixpoint pass) over the host-orchestrated
830 /// [`crate::grid_sync::dispatch_with_grid_sync_split_into`] (which
831 /// round-trips every live buffer host↔device between segments). Both are
832 /// correct; the choice is a performance route on a probed capability, not
833 /// a silent failure fallback.
834 ///
835 /// Default: `false`. Backends that implement resident dispatch override
836 /// this to `true`.
837 #[must_use]
838 fn supports_resident_dispatch(&self) -> bool {
839 false
840 }
841
842 /// Whether this backend partitions a program across more than one
843 /// physical device / node.
844 ///
845 /// Default: `false` (single-device execution).
846 #[must_use]
847 fn is_distributed(&self) -> bool {
848 false
849 }
850
851 /// Whether this backend lowers distributed collective communication
852 /// nodes (`AllReduce`, `AllGather`, `ReduceScatter`, `Broadcast`).
853 ///
854 /// This is intentionally separate from [`VyreBackend::is_distributed`]:
855 /// a backend may partition work across devices without yet exposing a
856 /// correct collective transport/lowering stack. Default: `false`.
857 #[must_use]
858 fn supports_distributed_collectives(&self) -> bool {
859 false
860 }
861
862 /// Maximum supported workgroup size per axis `[x, y, z]`.
863 ///
864 /// Default: `[1, 1, 1]` (scalar dispatch - a backend that has not
865 /// reported a real limit cannot be trusted to execute parallel
866 /// workgroups).
867 #[must_use]
868 fn max_workgroup_size(&self) -> [u32; 3] {
869 [1, 1, 1]
870 }
871
872 /// Maximum number of compute workgroups the backend can launch in one
873 /// dispatch dimension.
874 ///
875 /// Default: `1`, which is safe for scalar/reference backends but must be
876 /// overridden by real GPU backends so schedulers do not under-launch.
877 #[must_use]
878 fn max_compute_workgroups_per_dimension(&self) -> u32 {
879 1
880 }
881
882 /// Maximum total invocations allowed in a single workgroup.
883 ///
884 /// Default derives from [`max_workgroup_size`](Self::max_workgroup_size)
885 /// and fails loudly if a backend reports an unrepresentable product.
886 #[must_use]
887 fn max_compute_invocations_per_workgroup(&self) -> u32 {
888 let [x, y, z] = self.max_workgroup_size();
889 let invocations = u128::from(x) * u128::from(y) * u128::from(z);
890 u32::try_from(invocations).unwrap_or(u32::MAX)
891 }
892
893 /// Native subgroup size for the backing device when the backend
894 /// knows it. Returning
895 /// `None` tells the dispatch planner the backend can't report a
896 /// subgroup width - the planner falls back to `max_workgroup_size`
897 /// for its sizing heuristic.
898 ///
899 /// I.6 - adaptive workgroup sizing reads this capability to pick
900 /// a workgroup multiple of the subgroup so threads don't straddle
901 /// subgroups. Typical devices expose 16, 32, or 64 lanes.
902 #[must_use]
903 fn subgroup_size(&self) -> Option<u32> {
904 None
905 }
906
907 /// Maximum size in bytes of a single storage buffer the backend
908 /// accepts. `0` means the backend has not reported a limit, not
909 /// "unlimited".
910 ///
911 /// Default: `0`.
912 #[must_use]
913 fn max_storage_buffer_bytes(&self) -> u64 {
914 0
915 }
916
917 /// Unified backend-neutral device profile.
918 ///
919 /// Shared planner code should prefer this single profile over reading
920 /// individual capability methods one by one. Concrete backends may
921 /// override it when they can report richer device facts such as shared
922 /// memory size or native lowering-strategy features.
923 #[must_use]
924 fn device_profile(&self) -> crate::DeviceProfile {
925 let max_workgroup_size = self.max_workgroup_size();
926 crate::DeviceProfile {
927 backend: self.id(),
928 supports_subgroup_ops: self.supports_subgroup_ops(),
929 supports_indirect_dispatch: self.supports_indirect_dispatch(),
930 supports_distributed_collectives: self.supports_distributed_collectives(),
931 supports_specialization_constants: false,
932 supports_f16: self.supports_f16(),
933 supports_bf16: self.supports_bf16(),
934 supports_trap_propagation: false,
935 supports_tensor_cores: self.supports_tensor_cores(),
936 has_mul_high: false,
937 has_dual_issue_fp32_int32: false,
938 has_subgroup_shuffle: self.supports_subgroup_ops(),
939 has_shared_memory: false,
940 max_native_int_width: 32,
941 max_workgroup_size,
942 max_invocations_per_workgroup: self.max_compute_invocations_per_workgroup(),
943 max_shared_memory_bytes: 0,
944 max_storage_buffer_binding_size: self.max_storage_buffer_bytes(),
945 subgroup_size: self.subgroup_size().unwrap_or(0),
946 compute_units: 0,
947 regs_per_thread_max: 0,
948 l1_cache_bytes: 0,
949 l2_cache_bytes: 0,
950 mem_bw_gbps: 0,
951 timing_quality: crate::DeviceTimingQuality::HostOnly,
952 supports_device_timestamps: false,
953 supports_hardware_counters: false,
954 ideal_unroll_depth: 0,
955 ideal_vector_pack_bits: 0,
956 ideal_workgroup_tile: [0, 0, 0],
957 shared_memory_bank_count: 0,
958 shared_memory_bank_width_bytes: 0,
959 }
960 }
961
962 // ---------------------------------------------------------------
963 // Lifecycle hooks (defaulted, override as needed).
964 //
965 // These let a backend warm caches, flush pending work, recover from
966 // device loss, or tear down cleanly. Every hook defaults to a
967 // no-op-or-structured-error, so existing impls do not have to add
968 // any code.
969 // ---------------------------------------------------------------
970
971 /// Pre-dispatch warmup. Called before the first dispatch on a new
972 /// program so the backend can warm caches, compile ahead-of-time, or
973 /// acquire a device handle without paying that cost on the hot path.
974 ///
975 /// Default: no-op `Ok(())`.
976 ///
977 /// # Errors
978 ///
979 /// Returns [`BackendError`] if warmup cannot complete.
980 fn prepare(&self) -> Result<(), BackendError> {
981 Ok(())
982 }
983
984 /// Flush any queued work to the device and wait for it to complete.
985 ///
986 /// Useful before tearing down a context or before reading back data
987 /// that was produced by the last asynchronous dispatch.
988 ///
989 /// Default: no-op `Ok(())` - backends that do not queue work
990 /// implicitly satisfy flush.
991 ///
992 /// # Errors
993 ///
994 /// Returns [`BackendError`] on device failure.
995 fn flush(&self) -> Result<(), BackendError> {
996 Ok(())
997 }
998
999 /// Release device resources held by this backend. After `shutdown`
1000 /// returns the backend is in an unspecified state and may not be
1001 /// used for further dispatches.
1002 ///
1003 /// Default: no-op `Ok(())`.
1004 ///
1005 /// # Errors
1006 ///
1007 /// Returns [`BackendError`] on device failure during teardown.
1008 fn shutdown(&self) -> Result<(), BackendError> {
1009 Ok(())
1010 }
1011
1012 /// Probe whether the underlying device has been lost since the last
1013 /// successful dispatch.
1014 ///
1015 /// Default: `false` (assume healthy - backends that have no
1016 /// device-loss story do not need to probe).
1017 #[must_use]
1018 fn device_lost(&self) -> bool {
1019 false
1020 }
1021
1022 /// Attempt to recover from device loss by reacquiring the underlying
1023 /// device and invalidating pipeline caches.
1024 ///
1025 /// Default: returns an `UnsupportedFeature` error - recovery must be
1026 /// opt-in, because a backend that silently re-acquires without
1027 /// notifying the caller is a correctness hazard.
1028 ///
1029 /// # Errors
1030 ///
1031 /// Returns [`BackendError::UnsupportedFeature`] by default. Backends
1032 /// that implement recovery return any error encountered during
1033 /// re-acquisition.
1034 fn try_recover(&self) -> Result<(), BackendError> {
1035 Err(BackendError::UnsupportedFeature {
1036 name: "device recovery".to_string(),
1037 backend: self.id().to_string(),
1038 })
1039 }
1040
1041 /// Allocate a backend-owned device buffer of `byte_len` bytes.
1042 ///
1043 /// The returned [`DeviceBuffer`] handle is only meaningful to the
1044 /// backend that produced it. Backends that have not opted in return
1045 /// [`BackendError::UnsupportedFeature`] with the
1046 /// `DEVICE_BUFFER_FEATURE` name; production callers that require
1047 /// resident-buffer performance must treat that as a hard capability
1048 /// failure rather than silently routing through host `Vec<u8>`
1049 /// dispatch. Real device backends (cuda/wgpu/spirv) override this to
1050 /// wrap their concrete handle (for example, a vendor device allocation,
1051 /// vulkan buffer) in a `DeviceBuffer` impl.
1052 ///
1053 /// See `crate::backend::device_buffer` for the substrate.
1054 ///
1055 /// # Errors
1056 ///
1057 /// Returns [`BackendError::UnsupportedFeature`] when the backend
1058 /// has not yet implemented persistent device-buffer allocation.
1059 fn allocate_device_buffer(
1060 &self,
1061 _byte_len: usize,
1062 ) -> Result<Box<dyn DeviceBuffer>, BackendError> {
1063 Err(unsupported_device_buffer(self.id()))
1064 }
1065
1066 /// Upload host bytes into a previously-allocated device buffer.
1067 ///
1068 /// # Errors
1069 ///
1070 /// Returns [`BackendError`] when the buffer was not allocated by this
1071 /// backend, the byte length does not match the allocation, or the
1072 /// backend has not opted in to device-buffer dispatch.
1073 fn upload_device_buffer(
1074 &self,
1075 _buffer: &mut dyn DeviceBuffer,
1076 _bytes: &[u8],
1077 ) -> Result<(), BackendError> {
1078 Err(unsupported_device_buffer(self.id()))
1079 }
1080
1081 /// Download bytes from a device buffer back to a host `Vec<u8>`.
1082 ///
1083 /// # Errors
1084 ///
1085 /// Returns [`BackendError`] when the buffer was not allocated by this
1086 /// backend or the backend has not opted in to device-buffer dispatch.
1087 fn download_device_buffer(&self, _buffer: &dyn DeviceBuffer) -> Result<Vec<u8>, BackendError> {
1088 Err(unsupported_device_buffer(self.id()))
1089 }
1090
1091 /// Free a device buffer previously returned by
1092 /// [`Self::allocate_device_buffer`]. Explicit-free is required
1093 /// because the substrate does not assume reference-counted backend
1094 /// handles; consumers are responsible for calling this when done.
1095 ///
1096 /// # Errors
1097 ///
1098 /// Returns [`BackendError`] when the buffer was not allocated by this
1099 /// backend or the underlying free fails.
1100 fn free_device_buffer(&self, _buffer: Box<dyn DeviceBuffer>) -> Result<(), BackendError> {
1101 Err(unsupported_device_buffer(self.id()))
1102 }
1103
1104 /// Dispatch a Program with backend-owned device buffers as inputs and
1105 /// outputs.
1106 ///
1107 /// Backends that have implemented [`Self::allocate_device_buffer`]
1108 /// override this method to bind their concrete buffer type without
1109 /// the host upload/download round trip the legacy `dispatch` API
1110 /// requires. Backends that have not opted in return
1111 /// [`BackendError::UnsupportedFeature`]. Callers that choose this
1112 /// API are asking for resident-buffer execution; falling back to
1113 /// host-buffer dispatch would hide the copy cost and violate the
1114 /// performance contract.
1115 ///
1116 /// # Errors
1117 ///
1118 /// Returns [`BackendError::UnsupportedFeature`] by default. Real
1119 /// implementations return any error encountered during dispatch.
1120 fn dispatch_with_device_buffers(
1121 &self,
1122 _program: &Program,
1123 _inputs: &[&dyn DeviceBuffer],
1124 _outputs: &mut [&mut dyn DeviceBuffer],
1125 _config: &DispatchConfig,
1126 ) -> Result<(), BackendError> {
1127 Err(unsupported_device_buffer(self.id()))
1128 }
1129}
1130
1131fn elapsed_resident_sequence_wall_ns(started: std::time::Instant) -> Result<u64, BackendError> {
1132 u64::try_from(started.elapsed().as_nanos()).map_err(|error| BackendError::InvalidProgram {
1133 fix: format!(
1134 "Fix: resident sequence wall timing cannot fit u64 nanoseconds: {error}. Split telemetry windows or report per-step timing."
1135 ),
1136 })
1137}
1138
1139fn sum_optional_timing(
1140 accumulator: Option<u64>,
1141 next: Option<u64>,
1142 field: &'static str,
1143) -> Result<Option<u64>, BackendError> {
1144 match (accumulator, next) {
1145 (Some(left), Some(right)) => Ok(Some(left.checked_add(right).ok_or_else(|| {
1146 BackendError::InvalidProgram {
1147 fix: format!(
1148 "Fix: resident sequence {field} overflowed u64 nanoseconds. Split telemetry windows or report per-step timing instead of silently clamping."
1149 ),
1150 }
1151 })?)),
1152 _ => Ok(None),
1153 }
1154}
1155
1156#[cfg(test)]
1157
1158mod tests {
1159 use super::*;
1160 use std::sync::atomic::{AtomicUsize, Ordering};
1161
1162 struct TelemetryBackend;
1163
1164 impl private::Sealed for TelemetryBackend {}
1165
1166 impl VyreBackend for TelemetryBackend {
1167 fn id(&self) -> &'static str {
1168 "telemetry-test"
1169 }
1170
1171 fn dispatch(
1172 &self,
1173 _program: &Program,
1174 _inputs: &[Vec<u8>],
1175 _config: &DispatchConfig,
1176 ) -> Result<Vec<Vec<u8>>, BackendError> {
1177 Ok(vec![vec![1, 2], vec![3, 4]])
1178 }
1179 }
1180
1181 struct SequenceTimingBackend {
1182 dispatches: AtomicUsize,
1183 }
1184
1185 impl private::Sealed for SequenceTimingBackend {}
1186
1187 impl VyreBackend for SequenceTimingBackend {
1188 fn id(&self) -> &'static str {
1189 "sequence-timing-test"
1190 }
1191
1192 fn dispatch(
1193 &self,
1194 _program: &Program,
1195 _inputs: &[Vec<u8>],
1196 _config: &DispatchConfig,
1197 ) -> Result<Vec<Vec<u8>>, BackendError> {
1198 Ok(Vec::new())
1199 }
1200
1201 fn dispatch_resident_timed(
1202 &self,
1203 _program: &Program,
1204 _resources: &[Resource],
1205 config: &DispatchConfig,
1206 ) -> Result<TimedDispatchResult, BackendError> {
1207 let index = self.dispatches.fetch_add(1, Ordering::SeqCst) as u64;
1208 assert_eq!(
1209 config.grid_override,
1210 Some([index as u32 + 1, 1, 1]),
1211 "Fix: default resident sequence timing must preserve each step's grid override."
1212 );
1213 Ok(TimedDispatchResult {
1214 outputs: Vec::new(),
1215 wall_ns: 10 + index,
1216 device_ns: Some(7 + index),
1217 enqueue_ns: Some(3 + index),
1218 wait_ns: Some(4 + index),
1219 })
1220 }
1221
1222 fn download_resident_ranges_into(
1223 &self,
1224 ranges: &[(&Resource, usize, usize)],
1225 outputs: &mut [&mut Vec<u8>],
1226 ) -> Result<(), BackendError> {
1227 assert_eq!(ranges.len(), outputs.len());
1228 for ((resource, offset, len), output) in ranges.iter().zip(outputs.iter_mut()) {
1229 let Resource::Resident(id) = resource else {
1230 panic!("Fix: default timed resident sequence test expects resident resources.");
1231 };
1232 output.clear();
1233 output.extend_from_slice(&id.to_le_bytes());
1234 output.extend_from_slice(&(*offset as u64).to_le_bytes());
1235 output.extend_from_slice(&(*len as u64).to_le_bytes());
1236 }
1237 Ok(())
1238 }
1239 }
1240
1241 #[test]
1242 fn default_borrowed_into_dispatch_records_runtime_telemetry() {
1243 let _guard = crate::observability::audit_events_test_lock();
1244 let before = crate::observability::snapshot_dispatch_telemetry();
1245 let backend = TelemetryBackend;
1246 let mut outputs = vec![Vec::with_capacity(4), Vec::with_capacity(1)];
1247
1248 backend
1249 .dispatch_borrowed_into(
1250 &Program::empty(),
1251 &[&[9, 8, 7]],
1252 &DispatchConfig::default(),
1253 &mut outputs,
1254 )
1255 .expect("Fix: default borrowed-into dispatch must succeed");
1256
1257 let telemetry = crate::observability::snapshot_dispatch_telemetry();
1258 assert!(telemetry.launches >= before.launches + 1);
1259 assert!(telemetry.input_bytes >= before.input_bytes + 3);
1260 assert!(telemetry.output_bytes >= before.output_bytes + 4);
1261 assert!(telemetry.output_slots >= before.output_slots + 2);
1262 assert!(telemetry.output_slots_reused >= before.output_slots_reused + 1);
1263 assert!(telemetry.output_slots_moved >= before.output_slots_moved + 1);
1264 assert!(telemetry.output_slots_appended >= before.output_slots_appended);
1265 }
1266
1267 #[test]
1268 fn default_resident_sequence_timing_sums_step_device_times_and_reads_ranges() {
1269 let backend = SequenceTimingBackend {
1270 dispatches: AtomicUsize::new(0),
1271 };
1272 let program = Program::empty();
1273 let first_resources = [Resource::Resident(11)];
1274 let second_resources = [Resource::Resident(22)];
1275 let steps = [
1276 ResidentDispatchStep {
1277 program: &program,
1278 resources: &first_resources,
1279 grid_override: Some([1, 1, 1]),
1280 workgroup_override: None,
1281 },
1282 ResidentDispatchStep {
1283 program: &program,
1284 resources: &second_resources,
1285 grid_override: Some([2, 1, 1]),
1286 workgroup_override: None,
1287 },
1288 ];
1289 let read_resource = Resource::Resident(33);
1290 let reads = [ResidentReadRange {
1291 resource: &read_resource,
1292 byte_offset: 4,
1293 byte_len: 8,
1294 }];
1295 let mut output = Vec::new();
1296
1297 let timing = backend
1298 .dispatch_resident_sequence_read_ranges_timed_into(&steps, &reads, &mut [&mut output])
1299 .expect("Fix: default timed resident sequence must execute and read ranges.");
1300
1301 assert_eq!(backend.dispatches.load(Ordering::SeqCst), 2);
1302 assert_eq!(timing.device_ns, Some(15));
1303 assert_eq!(timing.enqueue_ns, Some(7));
1304 assert_eq!(timing.wait_ns, Some(9));
1305 assert!(timing.wall_ns > 0);
1306 assert_eq!(output.len(), 24);
1307 assert_eq!(u64::from_le_bytes(output[0..8].try_into().unwrap()), 33);
1308 assert_eq!(u64::from_le_bytes(output[8..16].try_into().unwrap()), 4);
1309 assert_eq!(u64::from_le_bytes(output[16..24].try_into().unwrap()), 8);
1310 }
1311
1312 #[test]
1313 fn default_dispatch_paths_use_shared_fallible_staging_and_checked_timing() {
1314 let backend_source = include_str!("vyre_backend.rs");
1315 let compiled_source = include_str!("compiled_pipeline.rs");
1316 let module_source = include_str!("../backend.rs");
1317
1318 assert!(
1319 module_source.contains("fn clone_borrowed_inputs_for_dispatch")
1320 && module_source.contains("fn reserve_batch_output_slots")
1321 && module_source.contains("fn checked_elapsed_wall_ns"),
1322 "Fix: backend defaults must share one fallible staging and checked timing contract."
1323 );
1324 for source in [backend_source, compiled_source] {
1325 let production = source
1326 .split("#[cfg(test)]")
1327 .next()
1328 .expect("Fix: backend source must contain production section before tests");
1329 assert!(
1330 production.contains("clone_borrowed_inputs_for_dispatch")
1331 && production.contains("checked_elapsed_wall_ns")
1332 && !production.contains(".as_nanos() as u64")
1333 && !production.contains("inputs.iter().map(|input| (*input).to_vec()).collect()"),
1334 "Fix: inherited backend dispatch defaults must avoid infallible borrowed-input collection and lossy wall timing."
1335 );
1336 }
1337 let compiled_production = compiled_source
1338 .split("#[cfg(test)]")
1339 .next()
1340 .expect("Fix: compiled pipeline source must contain production section before tests");
1341 assert!(
1342 compiled_production.contains("reserved_batch_output_slots")
1343 && !compiled_production.contains("Vec::with_capacity(batches.len())"),
1344 "Fix: compiled-pipeline batch defaults must construct output slots through shared fallible staging."
1345 );
1346 }
1347}