pub trait ModelExecutor: Send + Sync {
Show 18 methods
// Required methods
fn info(&self) -> &ModelInfo;
fn prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 PrefillInput,
) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait;
fn decode<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 DecodeInput,
) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait;
fn capabilities(&self) -> ExecutorCapabilities;
fn status(&self) -> ExecutorStatus;
// Provided methods
fn supports_native_unified_decode(&self) -> bool { ... }
fn kv_capacity(&self) -> Option<usize> { ... }
fn batch_prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [PrefillInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<PrefillOutput>>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn batch_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn unified_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
_batch: &'life1 UnifiedBatch,
) -> Pin<Box<dyn Future<Output = Result<Vec<Option<Vec<f32>>>>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn forward<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: &'life1 TensorRef,
) -> Pin<Box<dyn Future<Output = Result<TensorRef>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn truncate_kv<'life0, 'life1, 'async_trait>(
&'life0 self,
_kv_cache: &'life1 Arc<dyn KvCacheHandle>,
_new_len: usize,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn forward_verify<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn cache_metrics_snapshot(&self) -> Option<Value> { ... }
fn lora_metrics_snapshot(&self) -> Option<Value> { ... }
fn warmup<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait { ... }
fn shutdown<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait { ... }
fn release_cache(&self, _cache_id: &str) { ... }
}
Expand description
Core model executor trait focusing on tensor operations
Required Methods§
Sourcefn prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 PrefillInput,
) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 PrefillInput,
) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Execute prefill phase (process initial prompt)
Sourcefn decode<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 DecodeInput,
) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn decode<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 DecodeInput,
) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Execute decode phase (generate next token)
Sourcefn capabilities(&self) -> ExecutorCapabilities
fn capabilities(&self) -> ExecutorCapabilities
Get executor capabilities
Sourcefn status(&self) -> ExecutorStatus
fn status(&self) -> ExecutorStatus
Get current executor status
Provided Methods§
Sourcefn supports_native_unified_decode(&self) -> bool
fn supports_native_unified_decode(&self) -> bool
Whether this executor’s backend can run the unified mixed prefill+decode
forward natively. When false, the engine routes Qwen3-MoE batches through
the legacy split path. Reported by the (backend-aware) executor so the
engine stays backend-agnostic — replaces a cfg(target_os) branch that
previously hard-coded “Metal/CPU lack native unified” in the hot path.
Default false (conservative legacy path); accelerators with a native unified forward override to true.
Sourcefn kv_capacity(&self) -> Option<usize>
fn kv_capacity(&self) -> Option<usize>
Per-request KV capacity in tokens when the executor owns a smaller runtime cache window than the model’s declared context length.
Sourcefn batch_prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [PrefillInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<PrefillOutput>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn batch_prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [PrefillInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<PrefillOutput>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Batch prefill: process multiple prompts’ prefill in ONE forward pass.
Default implementation falls back to per-request prefill() (serial,
which is the current behavior the engine sees today). Executors that
support unified mixed-batch forward (e.g. via model.unified_forward
over a varlen QKV path) should override this to amortize launch /
kernel-overhead across all inputs items in one call.
Used by the continuous-batching engine to coalesce a cohort of new prefills (apples M3 c=32 sees 32 simultaneous prefills as one logical batch; the serial fallback runs each in ~47 ms, i.e. roughly 1.5 s for all 32, while a true batched path runs all 32 in ~100 ms).
Sourcefn batch_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn batch_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Batch decode: process multiple sequences in one forward pass.
Default implementation falls back to per-request decode().
Executors with batched CUDA runners should override this.
Sourcefn unified_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
_batch: &'life1 UnifiedBatch,
) -> Pin<Box<dyn Future<Output = Result<Vec<Option<Vec<f32>>>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn unified_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
_batch: &'life1 UnifiedBatch,
) -> Pin<Box<dyn Future<Output = Result<Vec<Option<Vec<f32>>>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Unified mixed-batch forward: process a UnifiedBatch containing
any combination of prefill chunks (one or more q_tokens per item,
possibly continuing from pos_offset > 0) and decode steps
(q_tokens.len() == 1, is_final_chunk = true) in a single model
forward pass.
Returns one element per batch.items[i]:
- Some(logits) for items with is_final_chunk = true (the request’s final-position logits, ready for sampling)
- None for intermediate prefill chunks (no lm_head executed — model only updates KV state)
Default implementation returns Err(unsupported). Concrete LLM
executors should override with either:
- A behavioral fallback that dispatches each chunk via existing
prefill() and groups decode items into batch_decode() (this preserves current behavior; no perf change), OR
- A real unified-forward path that runs all items through one
[M_total, hidden] GEMM chain with a varlen attention kernel (this is the chunked-prefill perf unlock).
Sourcefn forward<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: &'life1 TensorRef,
) -> Pin<Box<dyn Future<Output = Result<TensorRef>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn forward<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: &'life1 TensorRef,
) -> Pin<Box<dyn Future<Output = Result<TensorRef>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Optional: full forward pass (for non-autoregressive use cases)
Sourcefn truncate_kv<'life0, 'life1, 'async_trait>(
&'life0 self,
_kv_cache: &'life1 Arc<dyn KvCacheHandle>,
_new_len: usize,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn truncate_kv<'life0, 'life1, 'async_trait>(
&'life0 self,
_kv_cache: &'life1 Arc<dyn KvCacheHandle>,
_new_len: usize,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Roll the KV cache for this executor’s sequence back to new_len.
Used by speculative decoding on partial rejection so the next
iteration sees a KV prefix that matches the accepted token stream.
Default: Ok(()) — executors that don’t cache per-sequence state
(stub, mock) are inherently tolerant; real LLM executors override.
Sourcefn forward_verify<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn forward_verify<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Multi-position decode-verify: one forward over N+1 tokens,
producing one logits row per position. Used by speculative
decoding’s target path so we don’t pay N+1 sequential forwards.
Default falls back to N+1 sequential decode() calls — correct
but slow; real LLM executors override.
Returns a Vec<DecodeOutput> of length inputs.len() with the
final KV handle attached to the last element.
Sourcefn cache_metrics_snapshot(&self) -> Option<Value>
fn cache_metrics_snapshot(&self) -> Option<Value>
Optional model/executor cache metrics.
Concrete LLM executors use this for model-level paged KV prefix reuse counters. Default implementations keep non-autoregressive executors and tests from needing cache-specific plumbing.
Sourcefn lora_metrics_snapshot(&self) -> Option<Value>
fn lora_metrics_snapshot(&self) -> Option<Value>
Optional LoRA runtime metrics.
Sourcefn warmup<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
fn warmup<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
Warm up executor (load model, allocate memory, etc.)
Sourcefn shutdown<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
fn shutdown<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
Shutdown executor gracefully
Sourcefn release_cache(&self, _cache_id: &str)
fn release_cache(&self, _cache_id: &str)
Release KV cache and state for a completed sequence.
Called by the engine when a request finishes (success or error) to free
GPU memory held by the sequence’s KV cache. The cache_id matches the
value embedded in the KvCacheHandle returned by prefill/decode.
Dyn Compatibility§
This trait is dyn compatible.
In older versions of Rust, dyn compatibility was called "object safety".