Skip to main content

ModelExecutor

Trait ModelExecutor 

Source
/// Core model executor trait focusing on tensor operations.
///
/// Implementors must be `Send + Sync` (supertrait bounds below).
///
/// NOTE(review): the `'life0` / `'life1` / `'async_trait` lifetimes and the
/// `Pin<Box<dyn Future<...> + Send + 'async_trait>>` return types look like the
/// expansion of `async fn` methods by the `async_trait` macro — confirm against
/// the trait's source. Bodies of provided methods are elided (`{ ... }`) in this
/// listing, so their default behavior is only documented where the page states it.
pub trait ModelExecutor: Send + Sync {
    // Required methods

    /// Get model information and metadata.
    fn info(&self) -> &ModelInfo;

    /// Execute the prefill phase (process the initial prompt).
    ///
    /// Returns a boxed `Send` future resolving to
    /// `Result<PrefillOutput, FerrumError>`; the future borrows both `self`
    /// and `input` for its whole lifetime.
    fn prefill<'life0, 'life1, 'async_trait>(
        &'life0 self,
        input: &'life1 PrefillInput,
    ) -> Pin<Box<dyn Future<Output = Result<PrefillOutput, FerrumError>> + Send + 'async_trait>>
       where 'life0: 'async_trait,
             'life1: 'async_trait,
             Self: 'async_trait;

    /// Execute the decode phase (generate the next token).
    ///
    /// Returns a boxed `Send` future resolving to
    /// `Result<DecodeOutput, FerrumError>`; the future borrows both `self`
    /// and `input` for its whole lifetime.
    fn decode<'life0, 'life1, 'async_trait>(
        &'life0 self,
        input: &'life1 DecodeInput,
    ) -> Pin<Box<dyn Future<Output = Result<DecodeOutput, FerrumError>> + Send + 'async_trait>>
       where 'life0: 'async_trait,
             'life1: 'async_trait,
             Self: 'async_trait;

    /// Get executor capabilities.
    fn capabilities(&self) -> ExecutorCapabilities;

    /// Get current executor status.
    fn status(&self) -> ExecutorStatus;

    // Provided methods

    /// Batch decode: process multiple sequences in one forward pass.
    ///
    /// The default implementation falls back to per-request `decode()`.
    /// Executors with batched CUDA runners should override this.
    ///
    /// Resolves to one `Vec<DecodeOutput>` for the given slice of inputs, or
    /// a `FerrumError`.
    fn batch_decode<'life0, 'life1, 'async_trait>(
        &'life0 self,
        inputs: &'life1 [DecodeInput],
    ) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>>
       where 'life0: 'async_trait,
             'life1: 'async_trait,
             Self: 'async_trait { ... }

    /// Optional: full forward pass (for non-autoregressive use cases).
    ///
    /// NOTE(review): the leading underscore on `_input` suggests the default
    /// body does not use it — the body is elided here, so confirm what the
    /// default actually returns before relying on it.
    fn forward<'life0, 'life1, 'async_trait>(
        &'life0 self,
        _input: &'life1 Arc<dyn TensorLike>,
    ) -> Pin<Box<dyn Future<Output = Result<Arc<dyn TensorLike>, FerrumError>> + Send + 'async_trait>>
       where 'life0: 'async_trait,
             'life1: 'async_trait,
             Self: 'async_trait { ... }

    /// Warm up the executor (load model, allocate memory, etc.).
    ///
    /// Takes `&mut self`, so the caller needs exclusive access to the executor.
    fn warmup<'life0, 'async_trait>(
        &'life0 mut self,
    ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
       where 'life0: 'async_trait,
             Self: 'async_trait { ... }

    /// Shut down the executor gracefully.
    ///
    /// Takes `&mut self`, so the caller needs exclusive access to the executor.
    fn shutdown<'life0, 'async_trait>(
        &'life0 mut self,
    ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
       where 'life0: 'async_trait,
             Self: 'async_trait { ... }

    /// Release KV cache and state for a completed sequence.
    ///
    /// Called by the engine when a request finishes (success or error) to free
    /// GPU memory held by the sequence's KV cache. `_cache_id` matches the value
    /// embedded in the `KvCacheHandle` returned by prefill/decode.
    ///
    /// NOTE(review): the underscore-prefixed parameter suggests the default body
    /// ignores it — body is elided here, so confirm.
    fn release_cache(&self, _cache_id: &str) { ... }
}
Expand description

Core model executor trait focusing on tensor operations

Required Methods§

Source

fn info(&self) -> &ModelInfo

Get model information and metadata

Source

fn prefill<'life0, 'life1, 'async_trait>( &'life0 self, input: &'life1 PrefillInput, ) -> Pin<Box<dyn Future<Output = Result<PrefillOutput, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

Execute prefill phase (process initial prompt)

Source

fn decode<'life0, 'life1, 'async_trait>( &'life0 self, input: &'life1 DecodeInput, ) -> Pin<Box<dyn Future<Output = Result<DecodeOutput, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

Execute decode phase (generate next token)

Source

fn capabilities(&self) -> ExecutorCapabilities

Get executor capabilities

Source

fn status(&self) -> ExecutorStatus

Get current executor status

Provided Methods§

Source

fn batch_decode<'life0, 'life1, 'async_trait>( &'life0 self, inputs: &'life1 [DecodeInput], ) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

Batch decode: process multiple sequences in one forward pass.

Default implementation falls back to per-request decode(). Executors with batched CUDA runners should override this.

Source

fn forward<'life0, 'life1, 'async_trait>( &'life0 self, _input: &'life1 Arc<dyn TensorLike>, ) -> Pin<Box<dyn Future<Output = Result<Arc<dyn TensorLike>, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

Optional: full forward pass (for non-autoregressive use cases)

Source

fn warmup<'life0, 'async_trait>( &'life0 mut self, ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, Self: 'async_trait,

Warm up the executor (load model, allocate memory, etc.)

Source

fn shutdown<'life0, 'async_trait>( &'life0 mut self, ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, Self: 'async_trait,

Shut down the executor gracefully

Source

fn release_cache(&self, _cache_id: &str)

Release KV cache and state for a completed sequence.

Called by the engine when a request finishes (success or error) to free GPU memory held by the sequence’s KV cache. The cache_id matches the value embedded in the KvCacheHandle returned by prefill/decode.

Implementors§