pub struct QuantizedKVCache {
pub k_data: Vec<Vec<u8>>,
pub v_data: Vec<Vec<u8>>,
pub k_scales: Vec<Vec<f32>>,
pub v_scales: Vec<Vec<f32>>,
pub format: KVCacheFormat,
pub seq_len: usize,
pub max_seq_len: usize,
pub num_kv_heads: usize,
pub head_dim: usize,
pub num_layers: usize,
}

Quantized KV cache that stores K/V in reduced precision.
Fields

k_data: Vec<Vec<u8>>
    Quantized key data per layer - raw bytes in the chosen format.
v_data: Vec<Vec<u8>>
    Quantized value data per layer.
k_scales: Vec<Vec<f32>>
    Per-head scale factors for INT8 (one per head per position per layer). Layout: [layer][head * max_seq_len + pos]
v_scales: Vec<Vec<f32>>
format: KVCacheFormat
    Storage format.
seq_len: usize
    Current sequence length.
max_seq_len: usize
num_kv_heads: usize
head_dim: usize
num_layers: usize

Implementations
impl QuantizedKVCache
pub fn new(
    num_layers: usize,
    num_kv_heads: usize,
    max_seq_len: usize,
    head_dim: usize,
    format: KVCacheFormat,
) -> Self
Create a new quantized KV cache with the given dimensions and format.
pub fn remaining_capacity(&self) -> usize
Get remaining capacity.
pub fn memory_usage(&self) -> usize
Get memory usage in bytes.
pub fn write_kv(
    &mut self,
    layer: usize,
    pos: usize,
    k_data: &[f32],
    v_data: &[f32],
)
Write quantized K/V for one position.
k_data and v_data are [num_kv_heads * head_dim] each.
pub fn read_k(&self, layer: usize, head: usize, pos: usize) -> Vec<f32>
Dequantize and return the key for one head at one position.
pub fn read_v(&self, layer: usize, head: usize, pos: usize) -> Vec<f32>
Dequantize and return the value for one head at one position.
pub fn read_k_range(
    &self,
    layer: usize,
    head: usize,
    start_pos: usize,
    end_pos: usize,
) -> Vec<f32>
Dequantize the key range for one head.
Returns [end_pos - start_pos, head_dim] as a flat vec.
pub fn read_v_range(
    &self,
    layer: usize,
    head: usize,
    start_pos: usize,
    end_pos: usize,
) -> Vec<f32>
Dequantize the value range for one head.
Returns [end_pos - start_pos, head_dim] as a flat vec.
pub fn shift_left(&mut self, amount: usize)
Shift the cache left by `amount` positions (for sliding window).
Auto Trait Implementations
impl Freeze for QuantizedKVCache
impl RefUnwindSafe for QuantizedKVCache
impl Send for QuantizedKVCache
impl Sync for QuantizedKVCache
impl Unpin for QuantizedKVCache
impl UnsafeUnpin for QuantizedKVCache
impl UnwindSafe for QuantizedKVCache
Blanket Implementations

impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
impl<A, B, T> HttpServerConnExec<A, B> for T
where
    B: Body,
impl<T> Instrument for T
fn instrument(self, span: Span) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more