1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
//! Pre-compiled pipeline trait.
use crate::backend::{
private, BackendError, DispatchConfig, OutputBuffers, Resource, TimedDispatchResult,
};
/// A program that has been pre-compiled by a backend, ready for repeated
/// dispatch with new inputs without paying compilation cost on each call.
///
/// Build one with [`crate::pipeline::compile`]. Backends that override
/// [`crate::backend::VyreBackend::compile_native`] return a cached pipeline (skipping
/// shader compilation, pipeline-layout creation, and bind-group-layout
/// creation on every dispatch); backends that don't get a transparent
/// passthrough whose semantics are identical to repeated [`crate::backend::VyreBackend::dispatch`].
///
/// `CompiledPipeline::dispatch` MUST be bit-identical to
/// `VyreBackend::dispatch(program, inputs, config)` for the program this
/// pipeline was compiled from. Any divergence is a backend bug.
pub trait CompiledPipeline: private::Sealed + Send + Sync {
/// Stable identifier for this pipeline (typically `<backend>:<program-fingerprint>`).
///
/// Used by certificates and debugging to confirm a particular cached
/// pipeline was reused vs recompiled.
fn id(&self) -> &str;
/// Dispatch the precompiled pipeline with new inputs.
///
/// Bit-identical to `VyreBackend::dispatch(self.program, inputs, config)`.
///
/// # Errors
///
/// Returns [`BackendError`] when the backend cannot complete dispatch.
/// The error message always includes a `Fix: ` remediation section.
fn dispatch(
&self,
inputs: &[Vec<u8>],
config: &DispatchConfig,
) -> Result<Vec<Vec<u8>>, BackendError>;
/// Dispatch the precompiled pipeline with borrowed input buffers.
///
/// Backends may override this to bind caller-owned byte slices directly.
/// The default allocates the owned input vector once, preserving the
/// existing [`CompiledPipeline::dispatch`] contract for current backends.
///
/// # Errors
///
/// Returns [`BackendError`] when the backend cannot complete dispatch.
fn dispatch_borrowed(
&self,
inputs: &[&[u8]],
config: &DispatchConfig,
) -> Result<Vec<Vec<u8>>, BackendError> {
let owned = crate::backend::clone_borrowed_inputs_for_dispatch(
inputs,
"compiled pipeline input staging",
)?;
let outputs = self.dispatch(&owned, config)?;
crate::observability::record_dispatch_io(inputs, &outputs);
Ok(outputs)
}
/// Dispatch with backend-owned timing.
///
/// Default timing is host wall time. Native pipeline implementations may
/// attach device elapsed time without exposing driver APIs to callers.
///
/// # Errors
///
/// Returns [`BackendError`] when the backend cannot complete dispatch.
fn dispatch_borrowed_timed(
&self,
inputs: &[&[u8]],
config: &DispatchConfig,
) -> Result<TimedDispatchResult, BackendError> {
let started = std::time::Instant::now();
let outputs = self.dispatch_borrowed(inputs, config)?;
Ok(TimedDispatchResult {
outputs,
wall_ns: crate::backend::checked_elapsed_wall_ns(
started,
"compiled pipeline dispatch",
)?,
device_ns: None,
enqueue_ns: None,
wait_ns: None,
})
}
/// Dispatch the precompiled pipeline with borrowed inputs and write
/// outputs into caller-owned storage.
///
/// Backends may override this to reuse output buffers across repeated
/// dispatches. The default preserves the existing return-value contract and
/// copies returned bytes into existing output slots where possible.
///
/// # Errors
///
/// Returns [`BackendError`] when the backend cannot complete dispatch.
fn dispatch_borrowed_into(
&self,
inputs: &[&[u8]],
config: &DispatchConfig,
outputs: &mut OutputBuffers,
) -> Result<(), BackendError> {
let result = self.dispatch_borrowed(inputs, config)?;
let stats = crate::backend::replace_output_buffers_preserving_slots_with_memory_stats(
result, outputs,
);
crate::observability::record_output_replacement_stats(stats);
Ok(())
}
/// Dispatch several independent borrowed-input submissions for the same
/// compiled program.
///
/// Backends with native queues/streams should override this to enqueue the
/// whole batch before waiting for readback. The default is intentionally
/// semantic, not fast: it preserves bit-identical behavior for backends
/// that only implement the single-dispatch path.
///
/// # Errors
///
/// Returns [`BackendError`] when any item cannot complete dispatch.
fn dispatch_borrowed_batched(
&self,
batches: &[&[&[u8]]],
config: &DispatchConfig,
) -> Result<Vec<OutputBuffers>, BackendError> {
let mut outputs = crate::backend::reserved_batch_output_slots(
batches.len(),
"compiled borrowed batch outputs",
)?;
self.dispatch_borrowed_batched_into(batches, config, &mut outputs)?;
Ok(outputs)
}
/// Dispatch several borrowed-input submissions and write every item's
/// outputs into caller-owned storage.
///
/// The outer vector is one entry per batch item. Each inner
/// [`OutputBuffers`] preserves already-allocated output slots where the
/// backend can collect directly into caller storage. This is the hot
/// repeated-dispatch contract: callers can keep one output arena per batch
/// lane instead of rebuilding `Vec<Vec<u8>>` shells after every launch.
///
/// # Errors
///
/// Returns [`BackendError`] when any item cannot complete dispatch.
fn dispatch_borrowed_batched_into(
&self,
batches: &[&[&[u8]]],
config: &DispatchConfig,
outputs: &mut Vec<OutputBuffers>,
) -> Result<(), BackendError> {
crate::backend::resize_batch_output_slots(
outputs,
batches.len(),
"compiled borrowed batch outputs",
)?;
for (batch, slot) in batches.iter().zip(outputs.iter_mut()) {
self.dispatch_borrowed_into(batch, config, slot)?;
}
Ok(())
}
/// Dispatch the precompiled pipeline with mixed host/resident handles.
///
/// This is the P-41 contract: keep control, ring, IO, and debug buffers
/// GPU-resident across launches.
///
/// # Errors
///
/// Returns [`BackendError`] when the backend cannot complete dispatch.
fn dispatch_persistent_handles(
&self,
_inputs: &[Resource],
_config: &DispatchConfig,
) -> Result<OutputBuffers, BackendError> {
Err(BackendError::UnsupportedFeature {
name: "persistent handle dispatch".to_string(),
backend: "unspecified".to_string(),
})
}
/// Dispatch the precompiled pipeline with mixed host/resident handles and
/// write readback bytes into caller-owned output storage.
///
/// This is the single-submission resident reuse contract. Backends that
/// still need host-visible results must fill existing output slots instead
/// of forcing callers to rebuild `Vec<Vec<u8>>` shells on every resident
/// dispatch.
///
/// # Errors
///
/// Returns [`BackendError`] when the backend cannot complete dispatch.
fn dispatch_persistent_handles_into(
&self,
inputs: &[Resource],
config: &DispatchConfig,
outputs: &mut OutputBuffers,
) -> Result<(), BackendError> {
let result = self.dispatch_persistent_handles(inputs, config)?;
crate::observability::record_dispatch_io(&[], &result);
let stats = crate::backend::replace_output_buffers_preserving_slots_with_memory_stats(
result, outputs,
);
crate::observability::record_output_replacement_stats(stats);
Ok(())
}
/// Dispatch the precompiled pipeline with resident handles and return
/// resident resources for its ordered outputs without host readback.
///
/// This is the zero-copy chaining contract for multi-stage GPU pipelines:
/// callers allocate resident resources for every non-shared binding, pass
/// them in binding order, and receive the output subset in stable output
/// order so those buffers can feed later kernels directly. The returned
/// resources remain owned by the backend and must be freed by the caller
/// through [`crate::backend::VyreBackend::free_resident`] when no longer
/// needed.
///
/// # Errors
///
/// Returns [`BackendError`] when the backend cannot complete dispatch or
/// cannot preserve outputs as resident resources.
fn dispatch_persistent_resource_outputs(
&self,
_inputs: &[Resource],
_config: &DispatchConfig,
) -> Result<Vec<Resource>, BackendError> {
Err(BackendError::UnsupportedFeature {
name: "persistent resident output dispatch".to_string(),
backend: "unspecified".to_string(),
})
}
/// Dispatch several resident-handle submissions for the same compiled
/// program.
///
/// Native backends should override this to record/replay the batch through
/// one device submission or graph replay. The default preserves semantics
/// for backends that only implement the single-submission resident path.
///
/// # Errors
///
/// Returns [`BackendError`] when any item cannot complete dispatch.
fn dispatch_persistent_handles_batched(
&self,
batches: &[&[Resource]],
config: &DispatchConfig,
) -> Result<Vec<OutputBuffers>, BackendError> {
let mut outputs = crate::backend::reserved_batch_output_slots(
batches.len(),
"compiled resident batch outputs",
)?;
self.dispatch_persistent_handles_batched_into(batches, config, &mut outputs)?;
Ok(outputs)
}
/// Dispatch several resident-handle submissions and write readbacks into
/// caller-owned batch output storage.
///
/// This is the resident equivalent of
/// [`CompiledPipeline::dispatch_borrowed_batched_into`]. It keeps repeated
/// megakernel/dataflow evaluations from rebuilding host output shells when
/// readback is still requested.
///
/// # Errors
///
/// Returns [`BackendError`] when any item cannot complete dispatch.
fn dispatch_persistent_handles_batched_into(
&self,
batches: &[&[Resource]],
config: &DispatchConfig,
outputs: &mut Vec<OutputBuffers>,
) -> Result<(), BackendError> {
crate::backend::resize_batch_output_slots(
outputs,
batches.len(),
"compiled resident batch outputs",
)?;
for (batch, slot) in batches.iter().zip(outputs.iter_mut()) {
self.dispatch_persistent_handles_into(batch, config, slot)?;
}
Ok(())
}
/// Dispatch several fixed megakernel ABI resident-resource rows directly.
///
/// Megakernel resident dispatch always submits exactly four resources:
/// control, ring, debug log, and IO queue. This hook lets native backends
/// consume that fixed row shape without the runtime rebuilding a transient
/// `Vec<&[Resource]>` around every hot batch. Backends that only implement
/// the generic slice batch path inherit the semantic adapter below.
///
/// # Errors
///
/// Returns [`BackendError`] when any row cannot complete dispatch.
fn dispatch_persistent_handle_rows_into(
&self,
rows: &[[Resource; 4]],
config: &DispatchConfig,
outputs: &mut Vec<OutputBuffers>,
) -> Result<(), BackendError> {
let batches = borrowed_resource_rows(rows)?;
self.dispatch_persistent_handles_batched_into(&batches, config, outputs)
}
}
/// Builds one borrowed slice view per fixed four-resource row.
///
/// The returned vector has exactly `rows.len()` entries, in the same order as
/// `rows`; each entry borrows the corresponding row, so no `Resource` is
/// copied. Capacity is reserved fallibly up front so an oversized batch
/// surfaces as an error value instead of aborting on allocation failure.
///
/// # Errors
///
/// Returns [`BackendError::InvalidProgram`] with a `Fix: ` message when the
/// vector of views cannot be reserved.
fn borrowed_resource_rows(rows: &[[Resource; 4]]) -> Result<Vec<&[Resource]>, BackendError> {
    let mut views: Vec<&[Resource]> = Vec::new();
    if let Err(error) = views.try_reserve_exact(rows.len()) {
        return Err(BackendError::InvalidProgram {
            fix: format!(
                "Fix: failed to reserve {} fixed megakernel resident row view(s): {error}. Split the resident batch or override dispatch_persistent_handle_rows_into natively.",
                rows.len()
            ),
        });
    }
    // Capacity is already in place, so these pushes never reallocate.
    for row in rows {
        views.push(&row[..]);
    }
    Ok(views)
}