vyre_driver/backend/vyre_backend.rs
1//! The frozen `VyreBackend` contract.
2
3use std::sync::Arc;
4
5use smallvec::SmallVec;
6use vyre_foundation::ir::Program;
7
8use crate::backend::{
9 device_buffer::unsupported_device_buffer, private, BackendError, CompiledPipeline,
10 DeviceBuffer, DispatchConfig, OutputBuffers, PendingDispatch, Resource, TimedDispatchResult,
11};
12
13/// One backend-resident program dispatch in an ordered sequence.
14pub struct ResidentDispatchStep<'a> {
15 /// Program to dispatch.
16 pub program: &'a Program,
17 /// Resident resources in binding order.
18 pub resources: &'a [Resource],
19 /// Optional CUDA/grid-style launch override.
20 pub grid_override: Option<[u32; 3]>,
21}
22
23/// One compact byte range to read from a backend-resident resource.
24pub struct ResidentReadRange<'a> {
25 /// Resident resource to read.
26 pub resource: &'a Resource,
27 /// Byte offset inside the resident resource.
28 pub byte_offset: usize,
29 /// Number of bytes to read.
30 pub byte_len: usize,
31}
32
33/// Timing captured for an ordered resident dispatch sequence.
34#[derive(Clone, Debug, Default, Eq, PartialEq)]
35pub struct ResidentSequenceTiming {
36 /// Host-observed sequence duration including requested readbacks.
37 pub wall_ns: u64,
38 /// Device-observed elapsed dispatch time when the backend exposes timers.
39 pub device_ns: Option<u64>,
40 /// Host time spent enqueueing backend work before waiting.
41 pub enqueue_ns: Option<u64>,
42 /// Host time spent waiting for completion and collecting outputs.
43 pub wait_ns: Option<u64>,
44}
45
46/// The frozen contract between vyre and every execution backend.
47///
48/// A backend is a pure function from a validated `Program` and input buffers
49/// to output buffers. Implementations must be `Send + Sync`, deterministic
50/// for identical inputs, and byte-identical to the CPU reference on success.
51/// This trait is the keystone of the vyre abstraction thesis: frontends do
52/// not know which backend runs their IR, and backends do not know which
53/// frontend produced it.
54///
55/// # Examples
56///
57pub trait VyreBackend: private::Sealed + Send + Sync {
58 /// Stable backend identifier used for logging, certificates, and adapter selection.
59 ///
60 /// The identifier must be unique among all backends linked into the
61 /// current process. Conformance reports include this string so that
62 /// consumers know exactly which implementation was certified.
63 fn id(&self) -> &'static str;
64
65 /// Backend implementation version string used for certificates and
66 /// regression tracking.
67 ///
68 /// The default returns `"unspecified"`. Concrete backends should
69 /// override this with their crate version (e.g. `"0.4.0"`) so that
70 /// certificates can detect backend upgrades that may require re-cert.
71 fn version(&self) -> &'static str {
72 "unspecified"
73 }
74
75 /// Operation ids this backend can execute without further lowering.
76 fn supported_ops(&self) -> &std::collections::HashSet<vyre_foundation::ir::OpId> {
77 use crate::backend::validation::default_supported_ops;
78 default_supported_ops()
79 }
80
81 // Raw backend shader text is a concrete-driver implementation
82 // detail, not part of the substrate-neutral `VyreBackend`
83 // contract.
84
85 /// Executes the program with the given input buffers and returns the output buffers.
86 ///
87 /// On success the returned bytes must match the pure-Rust reference
88 /// implementation bit-for-bit. On failure the backend must return a
89 /// [`BackendError`] whose message contains an actionable `Fix: ` hint.
90 ///
91 /// # Examples
92 ///
93 /// ```no_run
94 /// use vyre::{Program, VyreBackend, DispatchConfig};
95 ///
96 /// # fn example(backend: &dyn VyreBackend, program: &Program) -> Result<Vec<Vec<u8>>, vyre::BackendError> {
97 /// let inputs = vec![vec![1u8, 2, 3]];
98 /// let config = DispatchConfig::default();
99 /// backend.dispatch(program, &inputs, &config)
100 /// # }
101 /// ```
102 ///
103 /// # Errors
104 ///
105 /// Returns [`BackendError`] when the backend cannot complete dispatch.
106 /// The error message always includes a `Fix: ` remediation section.
107 fn dispatch(
108 &self,
109 program: &Program,
110 inputs: &[Vec<u8>],
111 config: &DispatchConfig,
112 ) -> Result<Vec<Vec<u8>>, BackendError>;
113
114 /// Executes the program with borrowed input buffers.
115 ///
116 /// Backends may override this method to avoid staging borrowed bytes into
117 /// owned `Vec<u8>` buffers. The default is non-breaking: it performs one
118 /// owned vector allocation for the call and delegates to
119 /// [`VyreBackend::dispatch`].
120 ///
121 /// # Errors
122 ///
123 /// Returns [`BackendError`] when the backend cannot complete dispatch.
124 fn dispatch_borrowed(
125 &self,
126 program: &Program,
127 inputs: &[&[u8]],
128 config: &DispatchConfig,
129 ) -> Result<Vec<Vec<u8>>, BackendError> {
130 let owned =
131 crate::backend::clone_borrowed_inputs_for_dispatch(inputs, "backend input staging")?;
132 let outputs = self.dispatch(program, &owned, config)?;
133 crate::observability::record_dispatch_io(inputs, &outputs);
134 Ok(outputs)
135 }
136
137 /// Executes a borrowed-input dispatch and returns backend-owned timing.
138 ///
139 /// The default records host wall time and delegates to
140 /// [`VyreBackend::dispatch_borrowed`]. Device-specific backends override
141 /// this only inside their driver crates so benchmark crates never import
142 /// vendor APIs directly.
143 ///
144 /// # Errors
145 ///
146 /// Returns [`BackendError`] when the backend cannot complete dispatch.
147 fn dispatch_borrowed_timed(
148 &self,
149 program: &Program,
150 inputs: &[&[u8]],
151 config: &DispatchConfig,
152 ) -> Result<TimedDispatchResult, BackendError> {
153 let started = std::time::Instant::now();
154 let outputs = self.dispatch_borrowed(program, inputs, config)?;
155 Ok(TimedDispatchResult {
156 outputs,
157 wall_ns: crate::backend::checked_elapsed_wall_ns(started, "backend borrowed dispatch")?,
158 device_ns: None,
159 enqueue_ns: None,
160 wait_ns: None,
161 })
162 }
163
164 /// Executes the program with borrowed input buffers and writes outputs into
165 /// caller-owned storage.
166 ///
167 /// Backends may override this method to reuse output buffers across
168 /// dispatches. The default preserves the existing dispatch contract and
169 /// copies returned bytes into existing output slots where possible.
170 ///
171 /// # Errors
172 ///
173 /// Returns [`BackendError`] when the backend cannot complete dispatch.
174 fn dispatch_borrowed_into(
175 &self,
176 program: &Program,
177 inputs: &[&[u8]],
178 config: &DispatchConfig,
179 outputs: &mut OutputBuffers,
180 ) -> Result<(), BackendError> {
181 let result = self.dispatch_borrowed(program, inputs, config)?;
182 let stats = crate::backend::dispatch_result::replace_output_buffers_preserving_slots_with_memory_stats(
183 result,
184 outputs,
185 );
186 crate::observability::record_output_replacement_stats(stats);
187 Ok(())
188 }
189
190 /// Allocate a backend-resident buffer and return a stable resource handle.
191 ///
192 /// Backends that support resident resources override this method so callers
193 /// can keep hot inputs on the device without importing a concrete driver
194 /// crate. The returned [`Resource`] is only meaningful to the backend that
195 /// produced it.
196 ///
197 /// # Errors
198 ///
199 /// Returns [`BackendError`] when the backend cannot allocate a resident
200 /// resource of the requested size.
201 fn allocate_resident(&self, _byte_len: usize) -> Result<Resource, BackendError> {
202 Err(BackendError::UnsupportedFeature {
203 name: "resident buffer allocation".to_string(),
204 backend: self.id().to_string(),
205 })
206 }
207
208 /// Upload bytes into a backend-resident resource.
209 ///
210 /// # Errors
211 ///
212 /// Returns [`BackendError`] when the resource is not owned by this backend
213 /// or the byte length does not match the resident allocation.
214 fn upload_resident(&self, _resource: &Resource, _bytes: &[u8]) -> Result<(), BackendError> {
215 Err(BackendError::UnsupportedFeature {
216 name: "resident buffer upload".to_string(),
217 backend: self.id().to_string(),
218 })
219 }
220
221 /// Upload several backend-resident resources as one logical staging
222 /// operation.
223 ///
224 /// Backends that support resident graph/dataflow hot paths should override
225 /// this with a native batched transfer. The default fails loudly instead
226 /// of looping over [`VyreBackend::upload_resident`], because a hidden
227 /// per-buffer synchronization loop destroys the performance contract.
228 ///
229 /// # Errors
230 ///
231 /// Returns [`BackendError`] when the backend cannot batch resident uploads
232 /// or when any resource/byte length is invalid.
233 fn upload_resident_many(&self, _uploads: &[(&Resource, &[u8])]) -> Result<(), BackendError> {
234 Err(BackendError::UnsupportedFeature {
235 name: "resident buffer batch upload".to_string(),
236 backend: self.id().to_string(),
237 })
238 }
239
240 /// Upload bytes into a subrange of a backend-resident resource.
241 ///
242 /// This is the hot-loop path for reusable resident slots whose capacity is
243 /// larger than the current logical payload. Backends must not require the
244 /// upload length to equal the allocation length.
245 ///
246 /// # Errors
247 ///
248 /// Returns [`BackendError`] when ranged upload is unsupported, the resource
249 /// is not owned by this backend, or the destination range is out of bounds.
250 fn upload_resident_at(
251 &self,
252 _resource: &Resource,
253 _dst_offset_bytes: usize,
254 _bytes: &[u8],
255 ) -> Result<(), BackendError> {
256 Err(BackendError::UnsupportedFeature {
257 name: "resident buffer ranged upload".to_string(),
258 backend: self.id().to_string(),
259 })
260 }
261
262 /// Upload several resident subranges as one logical staging operation.
263 ///
264 /// The default fails loudly instead of looping over
265 /// [`VyreBackend::upload_resident_at`], because hidden per-range
266 /// synchronization breaks resident hot-loop performance.
267 ///
268 /// # Errors
269 ///
270 /// Returns [`BackendError`] when ranged batch upload is unsupported or when
271 /// any resource/range is invalid.
272 fn upload_resident_at_many(
273 &self,
274 _uploads: &[(&Resource, usize, &[u8])],
275 ) -> Result<(), BackendError> {
276 Err(BackendError::UnsupportedFeature {
277 name: "resident buffer ranged batch upload".to_string(),
278 backend: self.id().to_string(),
279 })
280 }
281
282 /// Download a backend-resident resource into a new host buffer.
283 ///
284 /// Prefer [`VyreBackend::download_resident_into`] in hot loops so repeated
285 /// validation does not allocate a fresh `Vec` for every readback.
286 ///
287 /// # Errors
288 ///
289 /// Returns [`BackendError`] when the backend cannot download resident
290 /// resources or when `resource` is not owned by this backend.
291 fn download_resident(&self, resource: &Resource) -> Result<Vec<u8>, BackendError> {
292 let mut bytes = Vec::new();
293 self.download_resident_into(resource, &mut bytes)?;
294 Ok(bytes)
295 }
296
297 /// Download a backend-resident resource into caller-owned storage.
298 ///
299 /// Implementations must clear and reuse `out`; hidden compatibility
300 /// allocation defeats resident hot-loop validation.
301 ///
302 /// # Errors
303 ///
304 /// Returns [`BackendError`] when resident download is unsupported or when
305 /// `resource` is not owned by this backend.
306 fn download_resident_into(
307 &self,
308 _resource: &Resource,
309 _out: &mut Vec<u8>,
310 ) -> Result<(), BackendError> {
311 Err(BackendError::UnsupportedFeature {
312 name: "resident buffer download".to_string(),
313 backend: self.id().to_string(),
314 })
315 }
316
317 /// Download a byte range from a backend-resident resource into a new host
318 /// buffer.
319 ///
320 /// Prefer [`VyreBackend::download_resident_range_into`] in hot loops.
321 ///
322 /// # Errors
323 ///
324 /// Returns [`BackendError`] when resident ranged download is unsupported,
325 /// the range is invalid, or `resource` is not owned by this backend.
326 fn download_resident_range(
327 &self,
328 resource: &Resource,
329 byte_offset: usize,
330 byte_len: usize,
331 ) -> Result<Vec<u8>, BackendError> {
332 let mut bytes = Vec::new();
333 bytes.try_reserve_exact(byte_len).map_err(|error| {
334 BackendError::InvalidProgram {
335 fix: format!(
336 "Fix: resident ranged download could not reserve {byte_len} output byte(s): {error}. Split the readback range before dispatch."
337 ),
338 }
339 })?;
340 self.download_resident_range_into(resource, byte_offset, byte_len, &mut bytes)?;
341 Ok(bytes)
342 }
343
344 /// Download a byte range from a backend-resident resource into
345 /// caller-owned storage.
346 ///
347 /// # Errors
348 ///
349 /// Returns [`BackendError`] when resident ranged download is unsupported,
350 /// the range is invalid, or `resource` is not owned by this backend.
351 fn download_resident_range_into(
352 &self,
353 _resource: &Resource,
354 _byte_offset: usize,
355 _byte_len: usize,
356 _out: &mut Vec<u8>,
357 ) -> Result<(), BackendError> {
358 Err(BackendError::UnsupportedFeature {
359 name: "resident buffer ranged download".to_string(),
360 backend: self.id().to_string(),
361 })
362 }
363
364 /// Download several byte ranges from backend-resident resources into
365 /// caller-owned storage as one logical readback operation.
366 ///
367 /// Backends with a real multi-readback path should override this to issue
368 /// all copies behind one backend synchronization boundary.
369 ///
370 /// # Errors
371 ///
372 /// Returns [`BackendError`] when counts do not match, any range is invalid,
373 /// or resident ranged download is unsupported.
374 fn download_resident_ranges_into(
375 &self,
376 ranges: &[(&Resource, usize, usize)],
377 outputs: &mut [&mut Vec<u8>],
378 ) -> Result<(), BackendError> {
379 if ranges.len() != outputs.len() {
380 return Err(BackendError::InvalidProgram {
381 fix: format!(
382 "Fix: resident ranged batch download expected matching range/output counts but got {} range(s) and {} output(s).",
383 ranges.len(),
384 outputs.len()
385 ),
386 });
387 }
388 for ((resource, byte_offset, byte_len), output) in ranges.iter().zip(outputs.iter_mut()) {
389 self.download_resident_range_into(resource, *byte_offset, *byte_len, output)?;
390 }
391 Ok(())
392 }
393
394 /// Free a backend-resident resource previously returned by
395 /// [`VyreBackend::allocate_resident`].
396 ///
397 /// # Errors
398 ///
399 /// Returns [`BackendError`] when the resource is unknown or still in use.
400 fn free_resident(&self, _resource: Resource) -> Result<(), BackendError> {
401 Err(BackendError::UnsupportedFeature {
402 name: "resident buffer free".to_string(),
403 backend: self.id().to_string(),
404 })
405 }
406
407 /// Dispatch using backend-resident resources and return backend-owned
408 /// timing.
409 ///
410 /// # Errors
411 ///
412 /// Returns [`BackendError`] when the backend does not support resident
413 /// dispatch or any resource is invalid for the program.
414 fn dispatch_resident_timed(
415 &self,
416 _program: &Program,
417 _resources: &[Resource],
418 _config: &DispatchConfig,
419 ) -> Result<TimedDispatchResult, BackendError> {
420 Err(BackendError::UnsupportedFeature {
421 name: "resident timed dispatch".to_string(),
422 backend: self.id().to_string(),
423 })
424 }
425
426 /// Dispatch an ordered sequence of resident-buffer programs and read
427 /// selected resident byte ranges into caller-owned storage.
428 ///
429 /// The default preserves correctness by dispatching each step through
430 /// [`VyreBackend::dispatch_resident_timed`] and then calling
431 /// [`VyreBackend::download_resident_ranges_into`]. CUDA overrides this to
432 /// enqueue the whole dependent chain plus D2H readbacks on one stream and
433 /// pay one host synchronization point.
434 ///
435 /// # Errors
436 ///
437 /// Returns [`BackendError`] when any step fails, when readback ranges are
438 /// invalid, or when the backend cannot perform resident dispatch/readback.
439 fn dispatch_resident_sequence_read_ranges_into(
440 &self,
441 steps: &[ResidentDispatchStep<'_>],
442 read_ranges: &[ResidentReadRange<'_>],
443 outputs: &mut [&mut Vec<u8>],
444 ) -> Result<(), BackendError> {
445 for step in steps {
446 let mut config = DispatchConfig::default();
447 config.grid_override = step.grid_override;
448 self.dispatch_resident_timed(step.program, step.resources, &config)?;
449 }
450 let ranges = read_ranges
451 .iter()
452 .map(|range| (range.resource, range.byte_offset, range.byte_len))
453 .collect::<SmallVec<[_; 8]>>();
454 self.download_resident_ranges_into(&ranges, outputs)
455 }
456
457 /// Timed variant of
458 /// [`VyreBackend::dispatch_resident_sequence_read_ranges_into`].
459 ///
460 /// The default preserves correctness by summing each step's resident
461 /// dispatch timing and then downloading the requested ranges. Backends with
462 /// a fused resident sequence path should override this to keep their
463 /// optimized stream/queue behavior while exposing device timing.
464 ///
465 /// # Errors
466 ///
467 /// Returns [`BackendError`] when any step fails, when readback ranges are
468 /// invalid, or when resident sequence timing overflows.
469 fn dispatch_resident_sequence_read_ranges_timed_into(
470 &self,
471 steps: &[ResidentDispatchStep<'_>],
472 read_ranges: &[ResidentReadRange<'_>],
473 outputs: &mut [&mut Vec<u8>],
474 ) -> Result<ResidentSequenceTiming, BackendError> {
475 let started = std::time::Instant::now();
476 let mut device_ns = Some(0_u64);
477 let mut enqueue_ns = Some(0_u64);
478 let mut wait_ns = Some(0_u64);
479 for step in steps {
480 let mut config = DispatchConfig::default();
481 config.grid_override = step.grid_override;
482 let timed = self.dispatch_resident_timed(step.program, step.resources, &config)?;
483 device_ns = sum_optional_timing(device_ns, timed.device_ns, "device timing")?;
484 enqueue_ns = sum_optional_timing(enqueue_ns, timed.enqueue_ns, "enqueue timing")?;
485 wait_ns = sum_optional_timing(wait_ns, timed.wait_ns, "wait timing")?;
486 }
487 let ranges = read_ranges
488 .iter()
489 .map(|range| (range.resource, range.byte_offset, range.byte_len))
490 .collect::<SmallVec<[_; 8]>>();
491 self.download_resident_ranges_into(&ranges, outputs)?;
492 Ok(ResidentSequenceTiming {
493 wall_ns: elapsed_resident_sequence_wall_ns(started)?,
494 device_ns,
495 enqueue_ns,
496 wait_ns,
497 })
498 }
499
500 /// Dispatch a resident prefix, repeat a resident sub-sequence, and read
501 /// selected resident byte ranges into caller-owned storage.
502 ///
503 /// This is the fixed-point hot-path contract: dataflow clients can express
504 /// a repeated kernel group without allocating one host sequence entry per
505 /// iteration. Backends that understand repetition should override this to
506 /// keep launch preparation and parameter upload sublinear in
507 /// `repeat_count`.
508 ///
509 /// # Errors
510 ///
511 /// Returns [`BackendError`] when any step fails, when readback ranges are
512 /// invalid, or when the backend cannot perform resident dispatch/readback.
513 fn dispatch_resident_repeated_sequence_read_ranges_into(
514 &self,
515 prefix_steps: &[ResidentDispatchStep<'_>],
516 repeated_steps: &[ResidentDispatchStep<'_>],
517 repeat_count: u32,
518 read_ranges: &[ResidentReadRange<'_>],
519 outputs: &mut [&mut Vec<u8>],
520 ) -> Result<(), BackendError> {
521 for step in prefix_steps {
522 let mut config = DispatchConfig::default();
523 config.grid_override = step.grid_override;
524 self.dispatch_resident_timed(step.program, step.resources, &config)?;
525 }
526 for _ in 0..repeat_count {
527 for step in repeated_steps {
528 let mut config = DispatchConfig::default();
529 config.grid_override = step.grid_override;
530 self.dispatch_resident_timed(step.program, step.resources, &config)?;
531 }
532 }
533 let ranges = read_ranges
534 .iter()
535 .map(|range| (range.resource, range.byte_offset, range.byte_len))
536 .collect::<SmallVec<[_; 8]>>();
537 self.download_resident_ranges_into(&ranges, outputs)
538 }
539
540 /// Optional pre-compilation hook for the pipeline-mode API.
541 ///
542 /// Default returns `Ok(None)` - the framework wraps in a passthrough
543 /// pipeline whose `dispatch` calls back into [`VyreBackend::dispatch`]
544 /// every time. Backends that genuinely cache compiled state (compute
545 /// pipeline, bind-group layout, lowered shader text) override this and
546 /// return `Ok(Some(...))` so repeated dispatches skip the compilation
547 /// overhead.
548 ///
549 /// The returned pipeline MUST be bit-identical to repeated
550 /// `dispatch(program, inputs, config)` for the program it was compiled
551 /// from. The cache key is the backend's responsibility - the framework
552 /// does not deduplicate compile calls.
553 ///
554 /// Implementing this method is the P-6 contract from
555 /// `docs/audits/ROADMAP_PERFORMANCE.md`: "compile target-text + pipeline +
556 /// bind-group-layout once; dispatch repeatedly with different inputs."
557 ///
558 /// # Errors
559 ///
560 /// Returns [`BackendError`] when the backend cannot complete the
561 /// pre-compilation. Callers should treat this as fatal for the program
562 /// (the program will not dispatch successfully via any path).
563 fn compile_native(
564 &self,
565 _program: &Program,
566 _config: &DispatchConfig,
567 ) -> Result<Option<Arc<dyn CompiledPipeline>>, BackendError> {
568 Ok(None)
569 }
570
571 /// Optional pre-compilation hook for callers that already own a shared
572 /// program allocation.
573 ///
574 /// Backends that store the program inside the compiled pipeline should
575 /// override this method and keep the supplied [`Arc<Program>`] instead of
576 /// cloning the IR. The default preserves the older borrowed-program hook
577 /// for backends that only inspect the program while compiling.
578 ///
579 /// # Errors
580 ///
581 /// Returns [`BackendError`] when backend-native compilation fails.
582 fn compile_native_shared(
583 &self,
584 program: Arc<Program>,
585 config: &DispatchConfig,
586 ) -> Result<Option<Arc<dyn CompiledPipeline>>, BackendError> {
587 self.compile_native(&program, config)
588 }
589
590 /// Optional compiled-pipeline cache counters for compile telemetry.
591 ///
592 /// Return `None` unless the backend can report real cache hits and misses.
593 fn pipeline_cache_snapshot(&self) -> Option<crate::pipeline::PipelineCacheSnapshot> {
594 None
595 }
596
597 /// Optional backend-specific numeric telemetry for release evidence.
598 ///
599 /// Return an empty vector unless the backend can report real counters.
600 /// Metric names must be stable ASCII identifiers suitable for JSON and
601 /// Prometheus-style export.
602 fn backend_metric_snapshot(&self) -> Vec<(&'static str, u64)> {
603 Vec::new()
604 }
605
606 /// Non-blocking dispatch primitive.
607 ///
608 /// Returns a [`PendingDispatch`] handle immediately; the caller
609 /// polls via [`PendingDispatch::is_ready`] and consumes the result
610 /// via [`PendingDispatch::await_result`]. Backends that genuinely
611 /// pipeline dispatches override this so N concurrent dispatches
612 /// do not serialize on the host.
613 ///
614 /// Default: run the synchronous [`VyreBackend::dispatch`] path and
615 /// wrap the result in a trivially-ready handle. This keeps every
616 /// backend useful from the async API without forcing an async
617 /// rewrite.
618 ///
619 /// # Errors
620 ///
621 /// Returns [`BackendError`] if the dispatch cannot start. Errors
622 /// that surface only during GPU execution come back through
623 /// [`PendingDispatch::await_result`], not from this call.
624 fn dispatch_async(
625 &self,
626 program: &Program,
627 inputs: &[Vec<u8>],
628 config: &DispatchConfig,
629 ) -> Result<Box<dyn PendingDispatch>, BackendError> {
630 let outputs = self.dispatch(program, inputs, config)?;
631 Ok(Box::new(crate::backend::pending_dispatch::ReadyPending {
632 outputs,
633 }))
634 }
635
636 /// Non-blocking dispatch with borrowed input buffers.
637 ///
638 /// Backends that record GPU commands synchronously before returning can
639 /// override this to avoid cloning input buffers just to create a pending
640 /// handle. The returned [`PendingDispatch`] must not borrow from `inputs`.
641 ///
642 /// # Errors
643 ///
644 /// Returns [`BackendError`] if the dispatch cannot start.
645 fn dispatch_borrowed_async(
646 &self,
647 program: &Program,
648 inputs: &[&[u8]],
649 config: &DispatchConfig,
650 ) -> Result<Box<dyn PendingDispatch>, BackendError> {
651 let outputs = self.dispatch_borrowed(program, inputs, config)?;
652 Ok(Box::new(crate::backend::pending_dispatch::ReadyPending {
653 outputs,
654 }))
655 }
656
657 // ---------------------------------------------------------------
658 // Capability queries (all default to conservative "no" / minimal).
659 //
660 // These are the stable capability surface. Additional backends implement
661 // this trait by default-inheriting every capability below and OVERRIDING
662 // only the ones where they are more capable than the conservative floor.
663 // This means adding a backend is strictly additive - no existing
664 // backend impl has to change when a new capability query is added.
665 //
666 // Backends MUST report HONESTLY. Returning `true` from a capability
667 // query is a promise the lowering path emits the corresponding
668 // intrinsic and the adapter supports it. "Supported but broken" is a
669 // LAW 9 evasion (see AGENTS.md). If the feature bit is set on the
670 // device but the lowering emits a slower emulation sequence, the
671 // answer is `false` until the native lowering catches up.
672 // ---------------------------------------------------------------
673
674 /// Whether this backend's lowering path emits subgroup / wave
675 /// intrinsics AND the current adapter exposes them.
676 ///
677 /// Default: `false` (conservative - assumes no native subgroup lowering).
678 #[must_use]
679 fn supports_subgroup_ops(&self) -> bool {
680 false
681 }
682
683 /// Whether this backend lowers IEEE 754 binary16 (`DataType::F16`)
684 /// natively rather than emulating through `f32`.
685 ///
686 /// Default: `false`.
687 #[must_use]
688 fn supports_f16(&self) -> bool {
689 false
690 }
691
692 /// Whether this backend lowers bfloat16 (`DataType::BF16`) natively.
693 ///
694 /// Default: `false`.
695 #[must_use]
696 fn supports_bf16(&self) -> bool {
697 false
698 }
699
700 /// Whether this backend emits tensor-core / matrix-engine intrinsics
701 /// for supported tensor shapes.
702 ///
703 /// Default: `false`.
704 #[must_use]
705 fn supports_tensor_cores(&self) -> bool {
706 false
707 }
708
709 /// Whether this backend overlaps copies and compute via independent
710 /// queues or async engines.
711 ///
712 /// Default: `false` (host serializes copy ↔ compute).
713 #[must_use]
714 fn supports_async_compute(&self) -> bool {
715 false
716 }
717
718 /// Whether this backend supports indirect dispatch
719 /// (`Node::IndirectDispatch`).
720 ///
721 /// Default: `false`.
722 #[must_use]
723 fn supports_indirect_dispatch(&self) -> bool {
724 false
725 }
726
727 /// Whether this backend supports speculative dispatch - a fused
728 /// prefilter + confirmer kernel with commit-gated output and a
729 /// counter tail read back by the host.
730 ///
731 /// Default: `false`.
732 #[must_use]
733 fn supports_speculation(&self) -> bool {
734 false
735 }
736
737 /// Whether this backend supports device-side persistent-thread
738 /// dispatch (a long-running kernel that polls a work queue).
739 ///
740 /// Default: `false`.
741 #[must_use]
742 fn supports_persistent_thread_dispatch(&self) -> bool {
743 false
744 }
745
746 /// Whether this backend can satisfy `Node::Barrier { ordering:
747 /// MemoryOrdering::GridSync }` inside a single dispatch - i.e.
748 /// every thread in the entire grid waits at the barrier and
749 /// every prior write is globally visible afterwards. Backends
750 /// that lack a native grid barrier (workgroup-only fences) must
751 /// return `false`; registration-based dispatch may lower a
752 /// `GridSync` barrier to a host-orchestrated kernel split only
753 /// when [`VyreBackend::allows_host_grid_sync_split`] also returns
754 /// `true`.
755 ///
756 /// Backends with cooperative whole-grid launch support can return
757 /// `true`; backends limited to workgroup-local synchronization return
758 /// `false` until the target exposes a compatible grid-barrier primitive.
759 ///
760 /// Default: `false`.
761 #[must_use]
762 fn supports_grid_sync(&self) -> bool {
763 false
764 }
765
766 /// Whether a native cooperative grid-sync launch of `program` with these
767 /// `inputs` and `config` can be made fully resident on this device.
768 ///
769 /// [`VyreBackend::supports_grid_sync`] reports that native lowering is
770 /// *available*; this reports whether it *fits* for a specific dispatch. A
771 /// cooperative launch requires every block co-resident, so a grid whose
772 /// block count exceeds the device's cooperative residency cannot run
773 /// natively and must route to the resident-fixpoint or host-split path.
774 /// Orchestrators call this to choose the native route only when it fits,
775 /// avoiding a wasted allocate/upload that would otherwise end in
776 /// [`crate::ErrorCode::CooperativeResidencyExceeded`].
777 ///
778 /// Default: `Ok(false)` (no native cooperative launch). Backends that lower
779 /// grid sync override this with the real residency check. Returns `Ok(false)`
780 /// — not an error — when the program carries no grid-sync barrier, since
781 /// there is then nothing to launch cooperatively.
782 ///
783 /// # Errors
784 ///
785 /// Returns [`BackendError`] if the launch geometry cannot be computed for
786 /// the program/inputs (a structurally invalid dispatch).
787 fn cooperative_grid_sync_fits(
788 &self,
789 _program: &Program,
790 _inputs: &[&[u8]],
791 _config: &DispatchConfig,
792 ) -> Result<bool, BackendError> {
793 Ok(false)
794 }
795
796 /// Whether the shared registry wrapper may emulate whole-grid
797 /// synchronization for this backend by splitting one program into
798 /// multiple host-dispatched kernels.
799 ///
800 /// This exists separately from [`VyreBackend::supports_grid_sync`]
801 /// because a backend can intentionally reject hidden host
802 /// orchestration while native cooperative-grid lowering is absent.
803 /// CUDA uses that policy in the release path so missing native
804 /// grid-barrier lowering is surfaced as an unsupported feature
805 /// instead of silently becoming a slower multi-launch path.
806 ///
807 /// Default: `true` to preserve existing behavior for simple
808 /// backends that intentionally rely on shared split lowering.
809 #[must_use]
810 fn allows_host_grid_sync_split(&self) -> bool {
811 true
812 }
813
814 /// Whether this backend implements the resident half of the contract
815 /// (`allocate_resident` / `upload_resident` / `dispatch_resident_timed` /
816 /// `dispatch_resident_repeated_sequence_read_ranges_into` /
817 /// `download_resident_*` / `free_resident`) well enough to run a
818 /// device-resident dispatch sequence.
819 ///
820 /// Consumers use this to choose
821 /// [`crate::grid_sync::dispatch_resident_grid_sync_fixpoint_into`] (which
822 /// keeps live buffers device-resident across every grid-sync segment and
823 /// fixpoint pass) over the host-orchestrated
824 /// [`crate::grid_sync::dispatch_with_grid_sync_split_into`] (which
825 /// round-trips every live buffer host↔device between segments). Both are
826 /// correct; the choice is a performance route on a probed capability, not
827 /// a silent failure fallback.
828 ///
829 /// Default: `false`. Backends that implement resident dispatch override
830 /// this to `true`.
831 #[must_use]
832 fn supports_resident_dispatch(&self) -> bool {
833 false
834 }
835
836 /// Whether this backend partitions a program across more than one
837 /// physical device / node.
838 ///
839 /// Default: `false` (single-device execution).
840 #[must_use]
841 fn is_distributed(&self) -> bool {
842 false
843 }
844
845 /// Whether this backend lowers distributed collective communication
846 /// nodes (`AllReduce`, `AllGather`, `ReduceScatter`, `Broadcast`).
847 ///
848 /// This is intentionally separate from [`VyreBackend::is_distributed`]:
849 /// a backend may partition work across devices without yet exposing a
850 /// correct collective transport/lowering stack. Default: `false`.
851 #[must_use]
852 fn supports_distributed_collectives(&self) -> bool {
853 false
854 }
855
856 /// Maximum supported workgroup size per axis `[x, y, z]`.
857 ///
858 /// Default: `[1, 1, 1]` (scalar dispatch - a backend that has not
859 /// reported a real limit cannot be trusted to execute parallel
860 /// workgroups).
861 #[must_use]
862 fn max_workgroup_size(&self) -> [u32; 3] {
863 [1, 1, 1]
864 }
865
866 /// Maximum number of compute workgroups the backend can launch in one
867 /// dispatch dimension.
868 ///
869 /// Default: `1`, which is safe for scalar/reference backends but must be
870 /// overridden by real GPU backends so schedulers do not under-launch.
871 #[must_use]
872 fn max_compute_workgroups_per_dimension(&self) -> u32 {
873 1
874 }
875
876 /// Maximum total invocations allowed in a single workgroup.
877 ///
878 /// Default derives from [`max_workgroup_size`](Self::max_workgroup_size)
879 /// and fails loudly if a backend reports an unrepresentable product.
880 #[must_use]
881 fn max_compute_invocations_per_workgroup(&self) -> u32 {
882 let [x, y, z] = self.max_workgroup_size();
883 let invocations = u128::from(x) * u128::from(y) * u128::from(z);
884 u32::try_from(invocations).unwrap_or(u32::MAX)
885 }
886
887 /// Native subgroup size for the backing device when the backend
888 /// knows it. Returning
889 /// `None` tells the dispatch planner the backend can't report a
890 /// subgroup width - the planner falls back to `max_workgroup_size`
891 /// for its sizing heuristic.
892 ///
893 /// I.6 - adaptive workgroup sizing reads this capability to pick
894 /// a workgroup multiple of the subgroup so threads don't straddle
895 /// subgroups. Typical devices expose 16, 32, or 64 lanes.
896 #[must_use]
897 fn subgroup_size(&self) -> Option<u32> {
898 None
899 }
900
901 /// Maximum size in bytes of a single storage buffer the backend
902 /// accepts. `0` means the backend has not reported a limit, not
903 /// "unlimited".
904 ///
905 /// Default: `0`.
906 #[must_use]
907 fn max_storage_buffer_bytes(&self) -> u64 {
908 0
909 }
910
911 /// Unified backend-neutral device profile.
912 ///
913 /// Shared planner code should prefer this single profile over reading
914 /// individual capability methods one by one. Concrete backends may
915 /// override it when they can report richer device facts such as shared
916 /// memory size or native lowering-strategy features.
917 #[must_use]
918 fn device_profile(&self) -> crate::DeviceProfile {
919 let max_workgroup_size = self.max_workgroup_size();
920 crate::DeviceProfile {
921 backend: self.id(),
922 supports_subgroup_ops: self.supports_subgroup_ops(),
923 supports_indirect_dispatch: self.supports_indirect_dispatch(),
924 supports_distributed_collectives: self.supports_distributed_collectives(),
925 supports_specialization_constants: false,
926 supports_f16: self.supports_f16(),
927 supports_bf16: self.supports_bf16(),
928 supports_trap_propagation: false,
929 supports_tensor_cores: self.supports_tensor_cores(),
930 has_mul_high: false,
931 has_dual_issue_fp32_int32: false,
932 has_subgroup_shuffle: self.supports_subgroup_ops(),
933 has_shared_memory: false,
934 max_native_int_width: 32,
935 max_workgroup_size,
936 max_invocations_per_workgroup: self.max_compute_invocations_per_workgroup(),
937 max_shared_memory_bytes: 0,
938 max_storage_buffer_binding_size: self.max_storage_buffer_bytes(),
939 subgroup_size: self.subgroup_size().unwrap_or(0),
940 compute_units: 0,
941 regs_per_thread_max: 0,
942 l1_cache_bytes: 0,
943 l2_cache_bytes: 0,
944 mem_bw_gbps: 0,
945 timing_quality: crate::DeviceTimingQuality::HostOnly,
946 supports_device_timestamps: false,
947 supports_hardware_counters: false,
948 ideal_unroll_depth: 0,
949 ideal_vector_pack_bits: 0,
950 ideal_workgroup_tile: [0, 0, 0],
951 shared_memory_bank_count: 0,
952 shared_memory_bank_width_bytes: 0,
953 }
954 }
955
956 // ---------------------------------------------------------------
957 // Lifecycle hooks (defaulted, override as needed).
958 //
959 // These let a backend warm caches, flush pending work, recover from
960 // device loss, or tear down cleanly. Every hook defaults to a
961 // no-op-or-structured-error, so existing impls do not have to add
962 // any code.
963 // ---------------------------------------------------------------
964
965 /// Pre-dispatch warmup. Called before the first dispatch on a new
966 /// program so the backend can warm caches, compile ahead-of-time, or
967 /// acquire a device handle without paying that cost on the hot path.
968 ///
969 /// Default: no-op `Ok(())`.
970 ///
971 /// # Errors
972 ///
973 /// Returns [`BackendError`] if warmup cannot complete.
974 fn prepare(&self) -> Result<(), BackendError> {
975 Ok(())
976 }
977
978 /// Flush any queued work to the device and wait for it to complete.
979 ///
980 /// Useful before tearing down a context or before reading back data
981 /// that was produced by the last asynchronous dispatch.
982 ///
983 /// Default: no-op `Ok(())` - backends that do not queue work
984 /// implicitly satisfy flush.
985 ///
986 /// # Errors
987 ///
988 /// Returns [`BackendError`] on device failure.
989 fn flush(&self) -> Result<(), BackendError> {
990 Ok(())
991 }
992
993 /// Release device resources held by this backend. After `shutdown`
994 /// returns the backend is in an unspecified state and may not be
995 /// used for further dispatches.
996 ///
997 /// Default: no-op `Ok(())`.
998 ///
999 /// # Errors
1000 ///
1001 /// Returns [`BackendError`] on device failure during teardown.
1002 fn shutdown(&self) -> Result<(), BackendError> {
1003 Ok(())
1004 }
1005
1006 /// Probe whether the underlying device has been lost since the last
1007 /// successful dispatch.
1008 ///
1009 /// Default: `false` (assume healthy - backends that have no
1010 /// device-loss story do not need to probe).
1011 #[must_use]
1012 fn device_lost(&self) -> bool {
1013 false
1014 }
1015
1016 /// Attempt to recover from device loss by reacquiring the underlying
1017 /// device and invalidating pipeline caches.
1018 ///
1019 /// Default: returns an `UnsupportedFeature` error - recovery must be
1020 /// opt-in, because a backend that silently re-acquires without
1021 /// notifying the caller is a correctness hazard.
1022 ///
1023 /// # Errors
1024 ///
1025 /// Returns [`BackendError::UnsupportedFeature`] by default. Backends
1026 /// that implement recovery return any error encountered during
1027 /// re-acquisition.
1028 fn try_recover(&self) -> Result<(), BackendError> {
1029 Err(BackendError::UnsupportedFeature {
1030 name: "device recovery".to_string(),
1031 backend: self.id().to_string(),
1032 })
1033 }
1034
1035 /// Allocate a backend-owned device buffer of `byte_len` bytes.
1036 ///
1037 /// The returned [`DeviceBuffer`] handle is only meaningful to the
1038 /// backend that produced it. Backends that have not opted in return
1039 /// [`BackendError::UnsupportedFeature`] with the
1040 /// `DEVICE_BUFFER_FEATURE` name; production callers that require
1041 /// resident-buffer performance must treat that as a hard capability
1042 /// failure rather than silently routing through host `Vec<u8>`
1043 /// dispatch. Real device backends (cuda/wgpu/spirv) override this to
1044 /// wrap their concrete handle (for example, a vendor device allocation,
1045 /// vulkan buffer) in a `DeviceBuffer` impl.
1046 ///
1047 /// See `crate::backend::device_buffer` for the substrate.
1048 ///
1049 /// # Errors
1050 ///
1051 /// Returns [`BackendError::UnsupportedFeature`] when the backend
1052 /// has not yet implemented persistent device-buffer allocation.
1053 fn allocate_device_buffer(
1054 &self,
1055 _byte_len: usize,
1056 ) -> Result<Box<dyn DeviceBuffer>, BackendError> {
1057 Err(unsupported_device_buffer(self.id()))
1058 }
1059
1060 /// Upload host bytes into a previously-allocated device buffer.
1061 ///
1062 /// # Errors
1063 ///
1064 /// Returns [`BackendError`] when the buffer was not allocated by this
1065 /// backend, the byte length does not match the allocation, or the
1066 /// backend has not opted in to device-buffer dispatch.
1067 fn upload_device_buffer(
1068 &self,
1069 _buffer: &mut dyn DeviceBuffer,
1070 _bytes: &[u8],
1071 ) -> Result<(), BackendError> {
1072 Err(unsupported_device_buffer(self.id()))
1073 }
1074
1075 /// Download bytes from a device buffer back to a host `Vec<u8>`.
1076 ///
1077 /// # Errors
1078 ///
1079 /// Returns [`BackendError`] when the buffer was not allocated by this
1080 /// backend or the backend has not opted in to device-buffer dispatch.
1081 fn download_device_buffer(&self, _buffer: &dyn DeviceBuffer) -> Result<Vec<u8>, BackendError> {
1082 Err(unsupported_device_buffer(self.id()))
1083 }
1084
1085 /// Free a device buffer previously returned by
1086 /// [`Self::allocate_device_buffer`]. Explicit-free is required
1087 /// because the substrate does not assume reference-counted backend
1088 /// handles; consumers are responsible for calling this when done.
1089 ///
1090 /// # Errors
1091 ///
1092 /// Returns [`BackendError`] when the buffer was not allocated by this
1093 /// backend or the underlying free fails.
1094 fn free_device_buffer(&self, _buffer: Box<dyn DeviceBuffer>) -> Result<(), BackendError> {
1095 Err(unsupported_device_buffer(self.id()))
1096 }
1097
1098 /// Dispatch a Program with backend-owned device buffers as inputs and
1099 /// outputs.
1100 ///
1101 /// Backends that have implemented [`Self::allocate_device_buffer`]
1102 /// override this method to bind their concrete buffer type without
1103 /// the host upload/download round trip the legacy `dispatch` API
1104 /// requires. Backends that have not opted in return
1105 /// [`BackendError::UnsupportedFeature`]. Callers that choose this
1106 /// API are asking for resident-buffer execution; falling back to
1107 /// host-buffer dispatch would hide the copy cost and violate the
1108 /// performance contract.
1109 ///
1110 /// # Errors
1111 ///
1112 /// Returns [`BackendError::UnsupportedFeature`] by default. Real
1113 /// implementations return any error encountered during dispatch.
1114 fn dispatch_with_device_buffers(
1115 &self,
1116 _program: &Program,
1117 _inputs: &[&dyn DeviceBuffer],
1118 _outputs: &mut [&mut dyn DeviceBuffer],
1119 _config: &DispatchConfig,
1120 ) -> Result<(), BackendError> {
1121 Err(unsupported_device_buffer(self.id()))
1122 }
1123}
1124
1125fn elapsed_resident_sequence_wall_ns(started: std::time::Instant) -> Result<u64, BackendError> {
1126 u64::try_from(started.elapsed().as_nanos()).map_err(|error| BackendError::InvalidProgram {
1127 fix: format!(
1128 "Fix: resident sequence wall timing cannot fit u64 nanoseconds: {error}. Split telemetry windows or report per-step timing."
1129 ),
1130 })
1131}
1132
1133fn sum_optional_timing(
1134 accumulator: Option<u64>,
1135 next: Option<u64>,
1136 field: &'static str,
1137) -> Result<Option<u64>, BackendError> {
1138 match (accumulator, next) {
1139 (Some(left), Some(right)) => Ok(Some(left.checked_add(right).ok_or_else(|| {
1140 BackendError::InvalidProgram {
1141 fix: format!(
1142 "Fix: resident sequence {field} overflowed u64 nanoseconds. Split telemetry windows or report per-step timing instead of silently clamping."
1143 ),
1144 }
1145 })?)),
1146 _ => Ok(None),
1147 }
1148}
1149
1150#[cfg(test)]
1151
1152mod tests {
1153 use super::*;
1154 use std::sync::atomic::{AtomicUsize, Ordering};
1155
1156 struct TelemetryBackend;
1157
1158 impl private::Sealed for TelemetryBackend {}
1159
1160 impl VyreBackend for TelemetryBackend {
1161 fn id(&self) -> &'static str {
1162 "telemetry-test"
1163 }
1164
1165 fn dispatch(
1166 &self,
1167 _program: &Program,
1168 _inputs: &[Vec<u8>],
1169 _config: &DispatchConfig,
1170 ) -> Result<Vec<Vec<u8>>, BackendError> {
1171 Ok(vec![vec![1, 2], vec![3, 4]])
1172 }
1173 }
1174
1175 struct SequenceTimingBackend {
1176 dispatches: AtomicUsize,
1177 }
1178
1179 impl private::Sealed for SequenceTimingBackend {}
1180
1181 impl VyreBackend for SequenceTimingBackend {
1182 fn id(&self) -> &'static str {
1183 "sequence-timing-test"
1184 }
1185
1186 fn dispatch(
1187 &self,
1188 _program: &Program,
1189 _inputs: &[Vec<u8>],
1190 _config: &DispatchConfig,
1191 ) -> Result<Vec<Vec<u8>>, BackendError> {
1192 Ok(Vec::new())
1193 }
1194
1195 fn dispatch_resident_timed(
1196 &self,
1197 _program: &Program,
1198 _resources: &[Resource],
1199 config: &DispatchConfig,
1200 ) -> Result<TimedDispatchResult, BackendError> {
1201 let index = self.dispatches.fetch_add(1, Ordering::SeqCst) as u64;
1202 assert_eq!(
1203 config.grid_override,
1204 Some([index as u32 + 1, 1, 1]),
1205 "Fix: default resident sequence timing must preserve each step's grid override."
1206 );
1207 Ok(TimedDispatchResult {
1208 outputs: Vec::new(),
1209 wall_ns: 10 + index,
1210 device_ns: Some(7 + index),
1211 enqueue_ns: Some(3 + index),
1212 wait_ns: Some(4 + index),
1213 })
1214 }
1215
1216 fn download_resident_ranges_into(
1217 &self,
1218 ranges: &[(&Resource, usize, usize)],
1219 outputs: &mut [&mut Vec<u8>],
1220 ) -> Result<(), BackendError> {
1221 assert_eq!(ranges.len(), outputs.len());
1222 for ((resource, offset, len), output) in ranges.iter().zip(outputs.iter_mut()) {
1223 let Resource::Resident(id) = resource else {
1224 panic!("Fix: default timed resident sequence test expects resident resources.");
1225 };
1226 output.clear();
1227 output.extend_from_slice(&id.to_le_bytes());
1228 output.extend_from_slice(&(*offset as u64).to_le_bytes());
1229 output.extend_from_slice(&(*len as u64).to_le_bytes());
1230 }
1231 Ok(())
1232 }
1233 }
1234
1235 #[test]
1236 fn default_borrowed_into_dispatch_records_runtime_telemetry() {
1237 let _guard = crate::observability::audit_events_test_lock();
1238 let before = crate::observability::snapshot_dispatch_telemetry();
1239 let backend = TelemetryBackend;
1240 let mut outputs = vec![Vec::with_capacity(4), Vec::with_capacity(1)];
1241
1242 backend
1243 .dispatch_borrowed_into(
1244 &Program::empty(),
1245 &[&[9, 8, 7]],
1246 &DispatchConfig::default(),
1247 &mut outputs,
1248 )
1249 .expect("Fix: default borrowed-into dispatch must succeed");
1250
1251 let telemetry = crate::observability::snapshot_dispatch_telemetry();
1252 assert!(telemetry.launches >= before.launches + 1);
1253 assert!(telemetry.input_bytes >= before.input_bytes + 3);
1254 assert!(telemetry.output_bytes >= before.output_bytes + 4);
1255 assert!(telemetry.output_slots >= before.output_slots + 2);
1256 assert!(telemetry.output_slots_reused >= before.output_slots_reused + 1);
1257 assert!(telemetry.output_slots_moved >= before.output_slots_moved + 1);
1258 assert!(telemetry.output_slots_appended >= before.output_slots_appended);
1259 }
1260
1261 #[test]
1262 fn default_resident_sequence_timing_sums_step_device_times_and_reads_ranges() {
1263 let backend = SequenceTimingBackend {
1264 dispatches: AtomicUsize::new(0),
1265 };
1266 let program = Program::empty();
1267 let first_resources = [Resource::Resident(11)];
1268 let second_resources = [Resource::Resident(22)];
1269 let steps = [
1270 ResidentDispatchStep {
1271 program: &program,
1272 resources: &first_resources,
1273 grid_override: Some([1, 1, 1]),
1274 },
1275 ResidentDispatchStep {
1276 program: &program,
1277 resources: &second_resources,
1278 grid_override: Some([2, 1, 1]),
1279 },
1280 ];
1281 let read_resource = Resource::Resident(33);
1282 let reads = [ResidentReadRange {
1283 resource: &read_resource,
1284 byte_offset: 4,
1285 byte_len: 8,
1286 }];
1287 let mut output = Vec::new();
1288
1289 let timing = backend
1290 .dispatch_resident_sequence_read_ranges_timed_into(&steps, &reads, &mut [&mut output])
1291 .expect("Fix: default timed resident sequence must execute and read ranges.");
1292
1293 assert_eq!(backend.dispatches.load(Ordering::SeqCst), 2);
1294 assert_eq!(timing.device_ns, Some(15));
1295 assert_eq!(timing.enqueue_ns, Some(7));
1296 assert_eq!(timing.wait_ns, Some(9));
1297 assert!(timing.wall_ns > 0);
1298 assert_eq!(output.len(), 24);
1299 assert_eq!(u64::from_le_bytes(output[0..8].try_into().unwrap()), 33);
1300 assert_eq!(u64::from_le_bytes(output[8..16].try_into().unwrap()), 4);
1301 assert_eq!(u64::from_le_bytes(output[16..24].try_into().unwrap()), 8);
1302 }
1303
1304 #[test]
1305 fn default_dispatch_paths_use_shared_fallible_staging_and_checked_timing() {
1306 let backend_source = include_str!("vyre_backend.rs");
1307 let compiled_source = include_str!("compiled_pipeline.rs");
1308 let module_source = include_str!("../backend.rs");
1309
1310 assert!(
1311 module_source.contains("fn clone_borrowed_inputs_for_dispatch")
1312 && module_source.contains("fn reserve_batch_output_slots")
1313 && module_source.contains("fn checked_elapsed_wall_ns"),
1314 "Fix: backend defaults must share one fallible staging and checked timing contract."
1315 );
1316 for source in [backend_source, compiled_source] {
1317 let production = source
1318 .split("#[cfg(test)]")
1319 .next()
1320 .expect("Fix: backend source must contain production section before tests");
1321 assert!(
1322 production.contains("clone_borrowed_inputs_for_dispatch")
1323 && production.contains("checked_elapsed_wall_ns")
1324 && !production.contains(".as_nanos() as u64")
1325 && !production.contains("inputs.iter().map(|input| (*input).to_vec()).collect()"),
1326 "Fix: inherited backend dispatch defaults must avoid infallible borrowed-input collection and lossy wall timing."
1327 );
1328 }
1329 let compiled_production = compiled_source
1330 .split("#[cfg(test)]")
1331 .next()
1332 .expect("Fix: compiled pipeline source must contain production section before tests");
1333 assert!(
1334 compiled_production.contains("reserved_batch_output_slots")
1335 && !compiled_production.contains("Vec::with_capacity(batches.len())"),
1336 "Fix: compiled-pipeline batch defaults must construct output slots through shared fallible staging."
1337 );
1338 }
1339}