ferrum_interfaces/
kv_dtype.rs

1//! KV cache element-type markers (Dim 5 of the 5-dimension architecture).
2//!
3//! These are pure marker types with no GPU dependencies, so they live
4//! in `ferrum-interfaces` rather than `ferrum-kernels`. The capability
5//! trait that links them to a backend (`BackendKvDtype<K>: BackendPagedKv`)
6//! does need GPU types, so it stays in `ferrum-kernels::backend`.
7//!
//! Each model's KV cache has its own precision, independent of the
//! model's compute precision. vLLM 0.6+ ships INT8 / FP8 KV caches that
//! halve KV memory at a small (<1%) accuracy hit. ferrum's type system
//! exposes this axis via the `K: KvDtypeKind` parameter on
//! `KvCache<B, K>` (default `K = KvFp16`).
13
/// Compile-time description of a KV cache element type.
///
/// Implementors are marker types: they hold no data and only publish
/// the associated constants below. Every implementor must be
/// `Send + Sync + 'static`.
pub trait KvDtypeKind
where
    Self: Send + Sync + 'static,
{
    /// Size of a single element in bytes, both on disk and in cache memory.
    const BYTES_PER_ELEM: usize;
    /// Stable identifier used in logs and debug output (e.g. "fp16", "int8").
    const NAME: &'static str;
}
21
22/// FP16 KV cache (the existing default on CUDA + Metal).
23pub struct KvFp16;
24impl KvDtypeKind for KvFp16 {
25    const NAME: &'static str = "fp16";
26    const BYTES_PER_ELEM: usize = 2;
27}
28
29/// BF16 KV cache (drop-in replacement for FP16 on Ampere+ / Apple Silicon).
30pub struct KvBf16;
31impl KvDtypeKind for KvBf16 {
32    const NAME: &'static str = "bf16";
33    const BYTES_PER_ELEM: usize = 2;
34}
35
36/// INT8 KV cache — half the memory of FP16 with per-token / per-channel
37/// scale factors. CUDA path planned via vLLM's quant_kv kernels.
38pub struct KvInt8;
39impl KvDtypeKind for KvInt8 {
40    const NAME: &'static str = "int8";
41    const BYTES_PER_ELEM: usize = 1;
42}
43
44/// FP8 KV cache — E4M3 by default. Hopper+ on CUDA, future on Metal.
45pub struct KvFp8;
46impl KvDtypeKind for KvFp8 {
47    const NAME: &'static str = "fp8";
48    const BYTES_PER_ELEM: usize = 1;
49}
ferrum_interfaces/kv_dtype.rs

ferrum_interfaces/
kv_dtype.rs