pub struct KvInt8;
Expand description
INT8 KV cache — half the memory of FP16 with per-token / per-channel scale factors. CUDA path planned via vLLM’s quant_kv kernels.
Trait Implementations§
Source§impl KvDtypeKind for KvInt8
impl KvDtypeKind for KvInt8
Source§impl<B: Backend + BackendInt8KvOps> KvLayer<B> for KvInt8
impl<B: Backend + BackendInt8KvOps> KvLayer<B> for KvInt8
Source§type Layer = KvCacheQuant<B, KvInt8>
type Layer = KvCacheQuant<B, KvInt8>
Per-layer cache type (FP16 → KvCache, INT8 → KvCacheQuant).
Source§fn alloc_paged(
max_blocks_per_seq: usize,
block_size: usize,
num_kv_heads: usize,
head_dim: usize,
) -> Self::Layer
fn alloc_paged( max_blocks_per_seq: usize, block_size: usize, num_kv_heads: usize, head_dim: usize, ) -> Self::Layer
Allocate a paged cache layer for one sequence.
Source§fn alloc_contig(
_capacity: usize,
_num_kv_heads: usize,
_head_dim: usize,
) -> Self::Layer
fn alloc_contig( _capacity: usize, _num_kv_heads: usize, _head_dim: usize, ) -> Self::Layer
Allocate a contiguous cache layer (FP16 only; INT8 panics).
fn len(layer: &Self::Layer) -> usize
fn set_len(layer: &mut Self::Layer, new_len: usize)
fn capacity(layer: &Self::Layer) -> usize
fn block_size(layer: &Self::Layer) -> usize
fn num_kv_heads(layer: &Self::Layer) -> usize
fn head_dim(layer: &Self::Layer) -> usize
fn block_table(layer: &Self::Layer) -> Option<&B::Buffer>
fn block_table_mut(layer: &mut Self::Layer) -> Option<&mut B::Buffer>
fn context_lens(layer: &Self::Layer) -> Option<&B::Buffer>
fn context_lens_mut(layer: &mut Self::Layer) -> Option<&mut B::Buffer>
fn paged_block_indices(layer: &Self::Layer) -> &[u32]
fn paged_block_indices_mut(layer: &mut Self::Layer) -> &mut Vec<u32>
Source§fn paged_write(
ctx: &mut B::Context,
layer: &mut Self::Layer,
qkv: &B::Buffer,
q_norm_w: &B::Buffer,
k_norm_w: &B::Buffer,
cos: &B::Buffer,
sin: &B::Buffer,
q_out: &mut B::Buffer,
k_scratch: &mut B::Buffer,
v_scratch: &mut B::Buffer,
_pool_k: &mut B::Buffer,
_pool_v: &mut B::Buffer,
tokens: usize,
num_q_heads: usize,
num_kv_heads: usize,
head_dim: usize,
pos_offset: usize,
eps: f32,
qk_mode: i32,
) -> Result<()>
fn paged_write( ctx: &mut B::Context, layer: &mut Self::Layer, qkv: &B::Buffer, q_norm_w: &B::Buffer, k_norm_w: &B::Buffer, cos: &B::Buffer, sin: &B::Buffer, q_out: &mut B::Buffer, k_scratch: &mut B::Buffer, v_scratch: &mut B::Buffer, _pool_k: &mut B::Buffer, _pool_v: &mut B::Buffer, tokens: usize, num_q_heads: usize, num_kv_heads: usize, head_dim: usize, pos_offset: usize, eps: f32, qk_mode: i32, ) -> Result<()>
Paged write: split QKV → norm → RoPE → write K/V into the paged pool. FP16 uses B::split_qkv_norm_rope_into_paged_cache. INT8 uses B::split_qkv_norm_rope + B::int8_kv_append_paged.
Source§fn paged_decode_attention(
ctx: &mut B::Context,
layer: &mut Self::Layer,
q: &B::Buffer,
_pool_k: &B::Buffer,
_pool_v: &B::Buffer,
output: &mut B::Buffer,
num_q_heads: usize,
num_kv_heads: usize,
head_dim: usize,
final_kv_len: usize,
_tokens: usize,
) -> Result<()>
fn paged_decode_attention( ctx: &mut B::Context, layer: &mut Self::Layer, q: &B::Buffer, _pool_k: &B::Buffer, _pool_v: &B::Buffer, output: &mut B::Buffer, num_q_heads: usize, num_kv_heads: usize, head_dim: usize, final_kv_len: usize, _tokens: usize, ) -> Result<()>
Paged decode attention. Reads from the per-layer cache, writes the attended output to output. FP16 reads from pool_k/pool_v; INT8 reads from layer-internal INT8 buffers (pool args ignored).
fn is_paged(layer: &Self::Layer) -> bool
Source§fn contig_write(
_ctx: &mut B::Context,
_layer: &mut Self::Layer,
_qkv: &B::Buffer,
_q_norm_w: &B::Buffer,
_k_norm_w: &B::Buffer,
_cos: &B::Buffer,
_sin: &B::Buffer,
_q_out: &mut B::Buffer,
_k_scratch: &mut B::Buffer,
_v_scratch: &mut B::Buffer,
_q_buf: &mut B::Buffer,
_k_buf: &mut B::Buffer,
_v_buf: &mut B::Buffer,
_tokens: usize,
_num_q_heads: usize,
_num_kv_heads: usize,
_head_dim: usize,
_pos_offset: usize,
_eps: f32,
_qk_mode: i32,
) -> Result<()>
fn contig_write( _ctx: &mut B::Context, _layer: &mut Self::Layer, _qkv: &B::Buffer, _q_norm_w: &B::Buffer, _k_norm_w: &B::Buffer, _cos: &B::Buffer, _sin: &B::Buffer, _q_out: &mut B::Buffer, _k_scratch: &mut B::Buffer, _v_scratch: &mut B::Buffer, _q_buf: &mut B::Buffer, _k_buf: &mut B::Buffer, _v_buf: &mut B::Buffer, _tokens: usize, _num_q_heads: usize, _num_kv_heads: usize, _head_dim: usize, _pos_offset: usize, _eps: f32, _qk_mode: i32, ) -> Result<()>
Contig write: FP16 only. INT8 inherits the panic default — KvInt8::alloc_contig panics in ensure_kv, so this branch is dead code on the INT8 path.
Auto Trait Implementations§
impl Freeze for KvInt8
impl RefUnwindSafe for KvInt8
impl Send for KvInt8
impl Sync for KvInt8
impl Unpin for KvInt8
impl UnsafeUnpin for KvInt8
impl UnwindSafe for KvInt8
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more