Struct PagedAttentionExecutor

Source

pub struct PagedAttentionExecutor { /* private fields */ }

Expand description

A model executor that actually uses a paged KV cache for attention.

Uses identity projections: for each token, the embedding is a one-hot vector of length `num_kv_heads * head_dim` derived from the token ID, and Q = K = V = embedding. This makes attention outputs deterministic and verifiable.

Logits are produced by summing the attention output elements per head and distributing the sums across vocab positions, so different attention patterns produce different token predictions.

Struct PagedAttentionExecutor Copy item path

Implementations§

impl PagedAttentionExecutor

pub fn new( config: PagedExecutorConfig, kv_manager: Arc<PagedKvCacheManager>, ) -> Self

pub fn prefill_count(&self) -> u64

pub fn decode_count(&self) -> u64

Trait Implementations§

impl ModelExecutor for PagedAttentionExecutor

fn info(&self) -> &ModelInfo

fn prefill<'life0, 'life1, 'async_trait>( &'life0 self, input: &'life1 PrefillInput, ) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait,

fn decode<'life0, 'life1, 'async_trait>( &'life0 self, input: &'life1 DecodeInput, ) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait,

fn capabilities(&self) -> ExecutorCapabilities

fn status(&self) -> ExecutorStatus

fn supports_native_unified_decode(&self) -> bool

fn kv_capacity(&self) -> Option<usize>

fn batch_prefill<'life0, 'life1, 'async_trait>( &'life0 self, inputs: &'life1 [PrefillInput], ) -> Pin<Box<dyn Future<Output = Result<Vec<PrefillOutput>, FerrumError>> + Send + 'async_trait>> where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn batch_decode<'life0, 'life1, 'async_trait>( &'life0 self, inputs: &'life1 [DecodeInput], ) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>> where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn unified_decode<'life0, 'life1, 'async_trait>( &'life0 self, _batch: &'life1 UnifiedBatch, ) -> Pin<Box<dyn Future<Output = Result<Vec<Option<Vec<f32>>>, FerrumError>> + Send + 'async_trait>> where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn forward<'life0, 'life1, 'async_trait>( &'life0 self, _input: &'life1 Arc<dyn TensorLike>, ) -> Pin<Box<dyn Future<Output = Result<Arc<dyn TensorLike>, FerrumError>> + Send + 'async_trait>> where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn truncate_kv<'life0, 'life1, 'async_trait>( &'life0 self, _kv_cache: &'life1 Arc<dyn KvCacheHandle>, _new_len: usize, ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>> where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn forward_verify<'life0, 'life1, 'async_trait>( &'life0 self, inputs: &'life1 [DecodeInput], ) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>> where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn cache_metrics_snapshot(&self) -> Option<Value>

fn lora_metrics_snapshot(&self) -> Option<Value>

fn warmup<'life0, 'async_trait>( &'life0 mut self, ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,

fn shutdown<'life0, 'async_trait>( &'life0 mut self, ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,

fn release_cache(&self, _cache_id: &str)

Auto Trait Implementations§

impl !Freeze for PagedAttentionExecutor

impl !RefUnwindSafe for PagedAttentionExecutor

impl !UnwindSafe for PagedAttentionExecutor

impl Send for PagedAttentionExecutor

impl Sync for PagedAttentionExecutor

impl Unpin for PagedAttentionExecutor

impl UnsafeUnpin for PagedAttentionExecutor

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> Same for T

type Output = T

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

Struct PagedAttentionExecutor

fn prefill<'life0, 'life1, 'async_trait>( &'life0 self, input: &'life1 PrefillInput, ) -> Pin<Box<dyn Future<Output = Result<PrefillOutput>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait,

fn decode<'life0, 'life1, 'async_trait>( &'life0 self, input: &'life1 DecodeInput, ) -> Pin<Box<dyn Future<Output = Result<DecodeOutput>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait,

fn batch_prefill<'life0, 'life1, 'async_trait>( &'life0 self, inputs: &'life1 [PrefillInput], ) -> Pin<Box<dyn Future<Output = Result<Vec<PrefillOutput>, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn batch_decode<'life0, 'life1, 'async_trait>( &'life0 self, inputs: &'life1 [DecodeInput], ) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn unified_decode<'life0, 'life1, 'async_trait>( &'life0 self, _batch: &'life1 UnifiedBatch, ) -> Pin<Box<dyn Future<Output = Result<Vec<Option<Vec<f32>>>, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn forward<'life0, 'life1, 'async_trait>( &'life0 self, _input: &'life1 Arc<dyn TensorLike>, ) -> Pin<Box<dyn Future<Output = Result<Arc<dyn TensorLike>, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn truncate_kv<'life0, 'life1, 'async_trait>( &'life0 self, _kv_cache: &'life1 Arc<dyn KvCacheHandle>, _new_len: usize, ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn forward_verify<'life0, 'life1, 'async_trait>( &'life0 self, inputs: &'life1 [DecodeInput], ) -> Pin<Box<dyn Future<Output = Result<Vec<DecodeOutput>, FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, 'life1: 'async_trait, Self: 'async_trait,

fn warmup<'life0, 'async_trait>( &'life0 mut self, ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, Self: 'async_trait,

fn shutdown<'life0, 'async_trait>( &'life0 mut self, ) -> Pin<Box<dyn Future<Output = Result<(), FerrumError>> + Send + 'async_trait>>
where 'life0: 'async_trait, Self: 'async_trait,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,