pub trait ModelExecutor: Send + Sync {
// Required methods
fn info(&self) -> &ModelInfo;
fn prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 PrefillInput,
) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait;
fn decode<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 DecodeInput,
) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait;
fn capabilities(&self) -> ExecutorCapabilities;
fn status(&self) -> ExecutorStatus;
// Provided methods
fn batch_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn forward<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: &'life1 TensorRef,
) -> Pin<Box<dyn Future<Output = Result<TensorRef>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait { ... }
fn warmup<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait { ... }
fn shutdown<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait,
'life0: 'async_trait { ... }
fn release_cache(&self, _cache_id: &str) { ... }
}Expand description
Core model executor trait focusing on tensor operations
Required Methods§
Sourcefn prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 PrefillInput,
) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 PrefillInput,
) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Execute prefill phase (process initial prompt)
Sourcefn decode<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 DecodeInput,
) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn decode<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 DecodeInput,
) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Execute decode phase (generate next token)
Sourcefn capabilities(&self) -> ExecutorCapabilities
fn capabilities(&self) -> ExecutorCapabilities
Get executor capabilities
Sourcefn status(&self) -> ExecutorStatus
fn status(&self) -> ExecutorStatus
Get current executor status
Provided Methods§
Sourcefn batch_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn batch_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Batch decode: process multiple sequences in one forward pass.
Default implementation falls back to per-request decode().
Executors with batched CUDA runners should override this.
Sourcefn forward<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: &'life1 TensorRef,
) -> Pin<Box<dyn Future<Output = Result<TensorRef>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn forward<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: &'life1 TensorRef,
) -> Pin<Box<dyn Future<Output = Result<TensorRef>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Optional: full forward pass (for non-autoregressive use cases)
Sourcefn warmup<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
fn warmup<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
Warm up executor (load model, allocate memory, etc.)
Sourcefn shutdown<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
fn shutdown<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>where
Self: 'async_trait,
'life0: 'async_trait,
Shutdown executor gracefully
Sourcefn release_cache(&self, _cache_id: &str)
fn release_cache(&self, _cache_id: &str)
Release KV cache and state for a completed sequence.
Called by the engine when a request finishes (success or error) to free
GPU memory held by the sequence’s KV cache. The cache_id matches the
value embedded in the KvCacheHandle returned by prefill/decode.