pub struct PagedAttentionExecutor { /* private fields */ }
Expand description
A model executor that actually uses a paged KV cache for attention.
Uses identity projections: for each token, the embedding is a one-hot
vector of length num_kv_heads * head_dim derived from the token ID.
Q = K = V = embedding. This makes attention outputs deterministic
and verifiable.
Logits are produced by summing attention output elements per head and distributing across vocab positions, so different attention patterns produce different token predictions.
Implementations§
Source§impl PagedAttentionExecutor
impl PagedAttentionExecutor
pub fn new( config: PagedExecutorConfig, kv_manager: Arc<PagedKvCacheManager>, ) -> Self
pub fn prefill_count(&self) -> u64
pub fn decode_count(&self) -> u64
Trait Implementations§
Source§impl ModelExecutor for PagedAttentionExecutor
impl ModelExecutor for PagedAttentionExecutor
Source§fn prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 PrefillInput,
) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>
where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 PrefillInput,
) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>
where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Execute prefill phase (process initial prompt)
Source§fn decode<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 DecodeInput,
) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>
where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
fn decode<'life0, 'life1, 'async_trait>(
&'life0 self,
input: &'life1 DecodeInput,
) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>
where
Self: 'async_trait,
'life0: 'async_trait,
'life1: 'async_trait,
Execute decode phase (generate next token)
Source§fn capabilities(&self) -> ExecutorCapabilities
fn capabilities(&self) -> ExecutorCapabilities
Get executor capabilities
Source§fn status(&self) -> ExecutorStatus
fn status(&self) -> ExecutorStatus
Get current executor status
Source§fn supports_native_unified_decode(&self) -> bool
fn supports_native_unified_decode(&self) -> bool
Whether this executor’s backend can run the unified mixed prefill+decode
forward natively. When false, the engine routes Qwen3-MoE batches through
the legacy split path. Reported by the (backend-aware) executor so the
engine stays backend-agnostic — replaces a `cfg(target_os)` branch that
previously hard-coded “Metal/CPU lack native unified” in the hot path. Read more
Source§fn kv_capacity(&self) -> Option<usize>
fn kv_capacity(&self) -> Option<usize>
Per-request KV capacity in tokens when the executor owns a smaller
runtime cache window than the model’s declared context length.
Source§fn batch_prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [PrefillInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<PrefillOutput>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
fn batch_prefill<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [PrefillInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<PrefillOutput>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
Batch prefill: process multiple prompts’ prefill in ONE forward pass. Read more
Source§fn batch_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
fn batch_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
Batch decode: process multiple sequences in one forward pass. Read more
Source§fn unified_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
_batch: &'life1 UnifiedBatch,
) -> Pin<Box<dyn Future<Output = Result<Vec<Option<Vec<f32>>>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
fn unified_decode<'life0, 'life1, 'async_trait>(
&'life0 self,
_batch: &'life1 UnifiedBatch,
) -> Pin<Box<dyn Future<Output = Result<Vec<Option<Vec<f32>>>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
Unified mixed-batch forward: process a `UnifiedBatch` containing
any combination of prefill chunks (one or more `q_tokens` per item,
possibly continuing from `pos_offset > 0`) and decode steps
(`q_tokens.len() == 1`, `is_final_chunk = true`) in a single model
forward pass. Read more
Source§fn forward<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: &'life1 Arc<dyn TensorLike>,
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn TensorLike>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
fn forward<'life0, 'life1, 'async_trait>(
&'life0 self,
_input: &'life1 Arc<dyn TensorLike>,
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn TensorLike>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
Optional: full forward pass (for non-autoregressive use cases)
Source§fn truncate_kv<'life0, 'life1, 'async_trait>(
&'life0 self,
_kv_cache: &'life1 Arc<dyn KvCacheHandle>,
_new_len: usize,
) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
fn truncate_kv<'life0, 'life1, 'async_trait>(
&'life0 self,
_kv_cache: &'life1 Arc<dyn KvCacheHandle>,
_new_len: usize,
) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
Roll the KV cache for this executor’s sequence back to `new_len`.
Used by speculative decoding on partial rejection so the next
iteration sees a KV prefix that matches the accepted token stream.
Default: `Ok(())` — executors that don’t cache per-sequence state
(stub, mock) are inherently tolerant; real LLM executors override.
Source§fn forward_verify<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
fn forward_verify<'life0, 'life1, 'async_trait>(
&'life0 self,
inputs: &'life1 [DecodeInput],
) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
Self: 'async_trait,
Multi-position decode-verify: one forward over `N+1` tokens,
producing one logits row per position. Used by speculative
decoding’s target path so we don’t pay N+1 sequential forwards. Read more
Source§fn cache_metrics_snapshot(&self) -> Option<Value>
fn cache_metrics_snapshot(&self) -> Option<Value>
Optional model/executor cache metrics. Read more
Source§fn lora_metrics_snapshot(&self) -> Option<Value>
fn lora_metrics_snapshot(&self) -> Option<Value>
Optional LoRA runtime metrics.
Source§fn warmup<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
Self: 'async_trait,
fn warmup<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
Self: 'async_trait,
Warm up executor (load model, allocate memory, etc.)
Source§fn shutdown<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
Self: 'async_trait,
fn shutdown<'life0, 'async_trait>(
&'life0 mut self,
) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where
'life0: 'async_trait,
Self: 'async_trait,
Shutdown executor gracefully
Source§fn release_cache(&self, _cache_id: &str)
fn release_cache(&self, _cache_id: &str)
Release KV cache and state for a completed sequence. Read more
Auto Trait Implementations§
impl !Freeze for PagedAttentionExecutor
impl !RefUnwindSafe for PagedAttentionExecutor
impl !UnwindSafe for PagedAttentionExecutor
impl Send for PagedAttentionExecutor
impl Sync for PagedAttentionExecutor
impl Unpin for PagedAttentionExecutor
impl UnsafeUnpin for PagedAttentionExecutor
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
T: ?Sized,
impl<T> BorrowMut<T> for T
where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more