pub struct KvFp16;
Expand description
FP16 KV cache (the existing default on CUDA + Metal).
Trait Implementations§
Source§impl BackendKvDtype<KvFp16> for CpuBackend
impl BackendKvDtype<KvFp16> for CpuBackend
Source§impl KvDtypeKind for KvFp16
impl KvDtypeKind for KvFp16
Source§impl<B: Backend + BackendPagedKv> KvLayer<B> for KvFp16
impl<B: Backend + BackendPagedKv> KvLayer<B> for KvFp16
Source§fn alloc_paged(
max_blocks_per_seq: usize,
block_size: usize,
num_kv_heads: usize,
head_dim: usize,
) -> Self::Layer
fn alloc_paged( max_blocks_per_seq: usize, block_size: usize, num_kv_heads: usize, head_dim: usize, ) -> Self::Layer
Allocate a paged cache layer for one sequence.
Source§fn alloc_contig(
capacity: usize,
num_kv_heads: usize,
head_dim: usize,
) -> Self::Layer
fn alloc_contig( capacity: usize, num_kv_heads: usize, head_dim: usize, ) -> Self::Layer
Allocate a contiguous cache layer (FP16 only; INT8 panics).
fn len(layer: &Self::Layer) -> usize
fn set_len(layer: &mut Self::Layer, new_len: usize)
fn capacity(layer: &Self::Layer) -> usize
fn block_size(layer: &Self::Layer) -> usize
fn num_kv_heads(layer: &Self::Layer) -> usize
fn head_dim(layer: &Self::Layer) -> usize
fn block_table(layer: &Self::Layer) -> Option<&B::Buffer>
fn block_table_mut(layer: &mut Self::Layer) -> Option<&mut B::Buffer>
fn context_lens(layer: &Self::Layer) -> Option<&B::Buffer>
fn context_lens_mut(layer: &mut Self::Layer) -> Option<&mut B::Buffer>
fn paged_block_indices(layer: &Self::Layer) -> &[u32]
fn paged_block_indices_mut(layer: &mut Self::Layer) -> &mut Vec<u32>
Source§fn paged_write(
ctx: &mut B::Context,
layer: &mut Self::Layer,
qkv: &B::Buffer,
q_norm_w: &B::Buffer,
k_norm_w: &B::Buffer,
cos: &B::Buffer,
sin: &B::Buffer,
q_out: &mut B::Buffer,
_k_scratch: &mut B::Buffer,
_v_scratch: &mut B::Buffer,
pool_k: &mut B::Buffer,
pool_v: &mut B::Buffer,
tokens: usize,
num_q_heads: usize,
num_kv_heads: usize,
head_dim: usize,
pos_offset: usize,
eps: f32,
qk_mode: i32,
) -> Result<()>
fn paged_write( ctx: &mut B::Context, layer: &mut Self::Layer, qkv: &B::Buffer, q_norm_w: &B::Buffer, k_norm_w: &B::Buffer, cos: &B::Buffer, sin: &B::Buffer, q_out: &mut B::Buffer, _k_scratch: &mut B::Buffer, _v_scratch: &mut B::Buffer, pool_k: &mut B::Buffer, pool_v: &mut B::Buffer, tokens: usize, num_q_heads: usize, num_kv_heads: usize, head_dim: usize, pos_offset: usize, eps: f32, qk_mode: i32, ) -> Result<()>
Paged write: split QKV → norm → RoPE → write K/V into the paged pool.
FP16 uses `B::split_qkv_norm_rope_into_paged_cache`.
INT8 uses `B::split_qkv_norm_rope` + `B::int8_kv_append_paged`.
Source§fn paged_decode_attention(
ctx: &mut B::Context,
layer: &mut Self::Layer,
q: &B::Buffer,
pool_k: &B::Buffer,
pool_v: &B::Buffer,
output: &mut B::Buffer,
num_q_heads: usize,
num_kv_heads: usize,
head_dim: usize,
final_kv_len: usize,
tokens: usize,
) -> Result<()>
fn paged_decode_attention( ctx: &mut B::Context, layer: &mut Self::Layer, q: &B::Buffer, pool_k: &B::Buffer, pool_v: &B::Buffer, output: &mut B::Buffer, num_q_heads: usize, num_kv_heads: usize, head_dim: usize, final_kv_len: usize, tokens: usize, ) -> Result<()>
Paged decode attention. Reads from the per-layer cache, writes the attended output to `output`.
FP16 reads from `pool_k`/`pool_v`; INT8 reads from layer-internal INT8 buffers (pool args ignored).
Source§fn contig_write(
ctx: &mut B::Context,
layer: &mut Self::Layer,
qkv: &B::Buffer,
q_norm_w: &B::Buffer,
k_norm_w: &B::Buffer,
cos: &B::Buffer,
sin: &B::Buffer,
q_out: &mut B::Buffer,
k_scratch: &mut B::Buffer,
v_scratch: &mut B::Buffer,
q_buf: &mut B::Buffer,
k_buf: &mut B::Buffer,
v_buf: &mut B::Buffer,
tokens: usize,
num_q_heads: usize,
num_kv_heads: usize,
head_dim: usize,
pos_offset: usize,
eps: f32,
qk_mode: i32,
) -> Result<()>
fn contig_write( ctx: &mut B::Context, layer: &mut Self::Layer, qkv: &B::Buffer, q_norm_w: &B::Buffer, k_norm_w: &B::Buffer, cos: &B::Buffer, sin: &B::Buffer, q_out: &mut B::Buffer, k_scratch: &mut B::Buffer, v_scratch: &mut B::Buffer, q_buf: &mut B::Buffer, k_buf: &mut B::Buffer, v_buf: &mut B::Buffer, tokens: usize, num_q_heads: usize, num_kv_heads: usize, head_dim: usize, pos_offset: usize, eps: f32, qk_mode: i32, ) -> Result<()>
Contig write: FP16 only. INT8 inherits the panic default — `KvInt8::alloc_contig` panics in `ensure_kv`, so this branch is dead code on the INT8 path.
Source§fn contig_decode_attention(
ctx: &mut B::Context,
layer: &Self::Layer,
q: &B::Buffer,
output: &mut B::Buffer,
attn_cfg: AttnConfig,
tokens: usize,
pos_offset: usize,
) -> Result<()>
fn contig_decode_attention( ctx: &mut B::Context, layer: &Self::Layer, q: &B::Buffer, output: &mut B::Buffer, attn_cfg: AttnConfig, tokens: usize, pos_offset: usize, ) -> Result<()>
Contig decode attention: FP16 only.
fn is_paged(layer: &Self::Layer) -> bool
Auto Trait Implementations§
impl Freeze for KvFp16
impl RefUnwindSafe for KvFp16
impl Send for KvFp16
impl Sync for KvFp16
impl Unpin for KvFp16
impl UnsafeUnpin for KvFp16
impl UnwindSafe for KvFp16
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more