pub struct CompressedKVCache<B: Backend> { /* private fields */ }
Expand description
Compressed KV cache compatible with paged attention APIs.
Implementations§
Source§impl<B: Backend> CompressedKVCache<B>
impl<B: Backend> CompressedKVCache<B>
Sourcepub fn new(
max_blocks: usize,
num_layers: usize,
num_heads: usize,
mla: MultiHeadLatentAttention<B>,
device: &B::Device,
) -> Self
pub fn new( max_blocks: usize, num_layers: usize, num_heads: usize, mla: MultiHeadLatentAttention<B>, device: &B::Device, ) -> Self
Create a compressed KV cache for a given MLA configuration.
Sourcepub fn allocate_sequence(&mut self) -> usize
pub fn allocate_sequence(&mut self) -> usize
Allocate a new sequence and return its id.
Sourcepub fn append(
&mut self,
layer: usize,
seq_id: usize,
keys: Tensor<B, 3>,
values: Tensor<B, 3>,
) -> Result<(), &'static str>
pub fn append( &mut self, layer: usize, seq_id: usize, keys: Tensor<B, 3>, values: Tensor<B, 3>, ) -> Result<(), &'static str>
Append uncompressed KV tensors (3D) into the compressed cache.
Sourcepub fn append_batched(
&mut self,
layer: usize,
seq_id: usize,
keys: Tensor<B, 4>,
values: Tensor<B, 4>,
) -> Result<(), &'static str>
pub fn append_batched( &mut self, layer: usize, seq_id: usize, keys: Tensor<B, 4>, values: Tensor<B, 4>, ) -> Result<(), &'static str>
Append uncompressed KV tensors (4D, batch=1) into the compressed cache.
Sourcepub fn append_compressed(
&mut self,
layer: usize,
seq_id: usize,
keys_latent: Tensor<B, 3>,
values_latent: Tensor<B, 3>,
) -> Result<(), &'static str>
pub fn append_compressed( &mut self, layer: usize, seq_id: usize, keys_latent: Tensor<B, 3>, values_latent: Tensor<B, 3>, ) -> Result<(), &'static str>
Append pre-compressed KV tensors directly.
Sourcepub fn get_kv(
&self,
layer: usize,
seq_id: usize,
) -> Result<(Tensor<B, 3>, Tensor<B, 3>), &'static str>
pub fn get_kv( &self, layer: usize, seq_id: usize, ) -> Result<(Tensor<B, 3>, Tensor<B, 3>), &'static str>
Get decompressed KV tensors for a layer/sequence.
Sourcepub fn get_compressed_kv(
&self,
layer: usize,
seq_id: usize,
) -> Result<(Tensor<B, 3>, Tensor<B, 3>), &'static str>
pub fn get_compressed_kv( &self, layer: usize, seq_id: usize, ) -> Result<(Tensor<B, 3>, Tensor<B, 3>), &'static str>
Get compressed KV tensors for a layer/sequence.
Sourcepub fn iter_kv_blocks(
&self,
layer: usize,
seq_id: usize,
) -> Result<Vec<(Tensor<B, 3>, Tensor<B, 3>)>, &'static str>
pub fn iter_kv_blocks( &self, layer: usize, seq_id: usize, ) -> Result<Vec<(Tensor<B, 3>, Tensor<B, 3>)>, &'static str>
Return the decompressed KV blocks for a layer/sequence, collected into a Vec.
Sourcepub fn iter_compressed_blocks(
&self,
layer: usize,
seq_id: usize,
) -> Result<Vec<(Tensor<B, 3>, Tensor<B, 3>)>, &'static str>
pub fn iter_compressed_blocks( &self, layer: usize, seq_id: usize, ) -> Result<Vec<(Tensor<B, 3>, Tensor<B, 3>)>, &'static str>
Return the compressed KV blocks for a layer/sequence, collected into a Vec.
Sourcepub fn seq_len(
&self,
layer: usize,
seq_id: usize,
) -> Result<usize, &'static str>
pub fn seq_len( &self, layer: usize, seq_id: usize, ) -> Result<usize, &'static str>
Get sequence length for a layer/sequence.
Sourcepub fn num_free_blocks(&self) -> usize
pub fn num_free_blocks(&self) -> usize
Get the number of free blocks in the cache.
pub fn latent_dim(&self) -> usize
pub fn device(&self) -> &B::Device
Trait Implementations§
Source§impl<B: Clone + Backend> Clone for CompressedKVCache<B>
impl<B: Clone + Backend> Clone for CompressedKVCache<B>
Source§fn clone(&self) -> CompressedKVCache<B>
fn clone(&self) -> CompressedKVCache<B>
Returns a duplicate of the value. Read more
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Auto Trait Implementations§
impl<B> Freeze for CompressedKVCache<B> where
<B as Backend>::Device: Freeze,
<B as Backend>::FloatTensorPrimitive: Freeze,
<B as Backend>::QuantizedTensorPrimitive: Freeze,
impl<B> RefUnwindSafe for CompressedKVCache<B> where
<B as Backend>::Device: RefUnwindSafe,
<B as Backend>::FloatTensorPrimitive: RefUnwindSafe,
<B as Backend>::QuantizedTensorPrimitive: RefUnwindSafe,
impl<B> Send for CompressedKVCache<B>
impl<B> Sync for CompressedKVCache<B>
impl<B> Unpin for CompressedKVCache<B> where
<B as Backend>::Device: Unpin,
<B as Backend>::FloatTensorPrimitive: Unpin,
<B as Backend>::QuantizedTensorPrimitive: Unpin,
impl<B> UnwindSafe for CompressedKVCache<B> where
<B as Backend>::Device: UnwindSafe,
<B as Backend>::FloatTensorPrimitive: UnwindSafe,
<B as Backend>::QuantizedTensorPrimitive: UnwindSafe,
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> CloneToUninit for T where
T: Clone,
impl<T> CloneToUninit for T where
T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more