pub trait AttentionOps: Send + Sync {
// Required method
fn attention(
&self,
q: &TensorRef,
k: &TensorRef,
v: &TensorRef,
params: &AttentionParams,
) -> Result<TensorRef>;
// Provided method
fn paged_attention(
&self,
_q: &TensorRef,
_k_cache: &TensorRef,
_v_cache: &TensorRef,
_block_table: &[u32],
_params: &AttentionParams,
) -> Result<TensorRef> { ... }
}

Attention operations.
Required Methods
fn attention(
&self,
q: &TensorRef,
k: &TensorRef,
v: &TensorRef,
params: &AttentionParams,
) -> Result<TensorRef>
fn attention( &self, q: &TensorRef, k: &TensorRef, v: &TensorRef, params: &AttentionParams, ) -> Result<TensorRef>
Standard multi-head / grouped-query attention.
- `q` — [batch, seq_q, num_heads, head_dim]
- `k` — [batch, seq_kv, num_kv_heads, head_dim]
- `v` — [batch, seq_kv, num_kv_heads, head_dim]
Returns attention output [batch, seq_q, num_heads, head_dim].
Provided Methods
fn paged_attention(
&self,
_q: &TensorRef,
_k_cache: &TensorRef,
_v_cache: &TensorRef,
_block_table: &[u32],
_params: &AttentionParams,
) -> Result<TensorRef>
fn paged_attention( &self, _q: &TensorRef, _k_cache: &TensorRef, _v_cache: &TensorRef, _block_table: &[u32], _params: &AttentionParams, ) -> Result<TensorRef>
Paged attention for KV-cache-based decode.
Default returns unsupported — backends opt in.