turbo-quant 0.2.0

Experimental vector compression sidecars with PolarQuant, TurboQuant, QJL sketches, wire formats, and benchmark receipts
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
//! KV cache compression for transformer attention.
//!
//! In a decoder-only transformer, the KV cache stores one key/value pair per
//! token per layer. With long context windows this dominates GPU memory.
//!
//! [`KvCacheCompressor`] provides an experimental online shadow-mode interface:
//! call [`KvCacheCompressor::compress_token`] as each token is generated, then
//! [`KvCacheCompressor::attention_scores`] to compute attention logits from the
//! selected key policy.
//!
//! Exact shadows can be retained for fallback and comparison. This module does
//! not validate deployed attention quality.
//!
//! # Usage
//!
//! ```rust
//! use turbo_quant::kv::{KvCacheCompressor, KvRuntimeConfig, KvQuantPolicy};
//!
//! let config = KvRuntimeConfig {
//!     head_dim: 64,        // per-attention-head dimension
//!     key_policy: KvQuantPolicy::quantized(8, 16),
//!     value_policy: KvQuantPolicy::Exact,
//!     seed: 42,
//!     keep_exact_shadow: true,
//! };
//!
//! let mut cache = KvCacheCompressor::new_runtime(config).unwrap();
//!
//! // At each generation step, compress the new key and value.
//! let key = vec![0.1f32; 64];
//! let value = vec![0.2f32; 64];
//! cache.compress_token(&key, &value).unwrap();
//!
//! // At attention time: compute logits against all compressed keys.
//! let query = vec![0.15f32; 64];
//! let scores = cache.attention_scores(&query).unwrap();
//! assert_eq!(scores.len(), 1); // one score per stored token
//!
//! // Retrieve approximate reconstructed values for the top-k tokens.
//! let values = cache.decode_values(&[0]).unwrap();
//! assert_eq!(values.len(), 1);
//! ```

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

use crate::{
    error::{Result, TurboQuantError},
    rotation::RotationKind,
    turbo::{TurboCode, TurboMode, TurboQuantizer},
};

/// Quantization policy for one KV stream.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum KvQuantPolicy {
    /// Keep exact f32 vectors and do not create a compressed sidecar.
    Exact,
    /// Create TurboQuant sidecars. Callers should benchmark before promoting.
    Quantized {
        bits: u8,
        projections: usize,
        mode: TurboMode,
        rotation_kind: RotationKind,
    },
}

impl KvQuantPolicy {
    pub fn quantized(bits: u8, projections: usize) -> Self {
        Self::Quantized {
            bits,
            projections,
            mode: TurboMode::PolarWithQjl,
            rotation_kind: RotationKind::Auto,
        }
    }

    pub fn quantized_with_stored_rotation(bits: u8, projections: usize) -> Self {
        Self::Quantized {
            bits,
            projections,
            mode: TurboMode::PolarWithQjl,
            rotation_kind: RotationKind::StoredQr,
        }
    }
}

/// Legacy configuration for a single attention head's KV cache compressor.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
pub struct KvCacheConfig {
    /// Dimension of each attention head's key/value vectors.
    pub head_dim: usize,
    /// Legacy symmetric TurboQuant bit budget.
    pub bits: u8,
    /// Legacy symmetric QJL projection count.
    pub projections: usize,
    /// Deterministic seed for all random matrices.
    pub seed: u64,
}

/// Runtime configuration for shadow-mode KV experiments.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
pub struct KvRuntimeConfig {
    /// Dimension of each attention head's key/value vectors.
    pub head_dim: usize,
    /// Key compression policy.
    pub key_policy: KvQuantPolicy,
    /// Value compression policy. Values may need a different policy from keys.
    pub value_policy: KvQuantPolicy,
    /// Deterministic seed for all random matrices.
    pub seed: u64,
    /// Keep exact keys/values so callers can run fallback and shadow comparison.
    pub keep_exact_shadow: bool,
}

impl From<KvCacheConfig> for KvRuntimeConfig {
    fn from(config: KvCacheConfig) -> Self {
        Self {
            head_dim: config.head_dim,
            key_policy: KvQuantPolicy::quantized(config.bits, config.projections),
            value_policy: KvQuantPolicy::quantized(config.bits, config.projections),
            seed: config.seed,
            keep_exact_shadow: true,
        }
    }
}

/// Legacy compressed KV cache entry for one token.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct CompressedToken {
    pub compressed_key: TurboCode,
    pub compressed_value: TurboCode,
}

/// Runtime shadow entry for one token.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct KvShadowToken {
    pub compressed_key: Option<TurboCode>,
    pub compressed_value: Option<TurboCode>,
    pub exact_key: Option<Vec<f32>>,
    pub exact_value: Option<Vec<f32>>,
}

#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, JsonSchema)]
pub struct KvShadowScore {
    pub exact: f32,
    pub compressed: f32,
    pub abs_error: f32,
}

#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, JsonSchema)]
pub enum AttentionScale {
    None,
    ByHeadDim,
    Custom(f32),
}

#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, JsonSchema)]
pub struct AttentionScoreOptions {
    pub scale: AttentionScale,
}

impl Default for AttentionScoreOptions {
    fn default() -> Self {
        Self {
            scale: AttentionScale::None,
        }
    }
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
pub struct KvMemoryReportV1 {
    pub schema: String,
    pub token_count: usize,
    pub head_dim: usize,
    pub compressed_key_bytes: usize,
    pub compressed_value_bytes: usize,
    pub exact_shadow_key_bytes: usize,
    pub exact_shadow_value_bytes: usize,
    pub raw_fp32_baseline_bytes: usize,
    pub fp16_baseline_bytes: usize,
    pub resident_bytes: usize,
    pub sidecar_only_bytes: usize,
    pub warnings: Vec<String>,
}

/// Online KV cache compressor for one attention head.
///
/// Tokens are appended in generation order. Attention scores are computed
/// across all stored compressed keys without decompression.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KvCacheCompressor {
    config: KvRuntimeConfig,
    key_quantizer: Option<TurboQuantizer>,
    value_quantizer: Option<TurboQuantizer>,
    tokens: Vec<KvShadowToken>,
}

impl KvCacheCompressor {
    /// Create a new compressor for one attention head.
    pub fn new(config: KvCacheConfig) -> Result<Self> {
        Self::new_runtime(config.into())
    }

    /// Create a new compressor from explicit runtime policies.
    pub fn new_runtime(config: KvRuntimeConfig) -> Result<Self> {
        let key_quantizer = quantizer_for_policy(config.head_dim, &config.key_policy, config.seed)?;
        // Value quantizer uses an independent seed offset so key and value
        // rotation matrices are uncorrelated.
        let value_quantizer = quantizer_for_policy(
            config.head_dim,
            &config.value_policy,
            config.seed.wrapping_add(0x1234_5678_ABCD_EF00),
        )?;
        Ok(Self {
            config,
            key_quantizer,
            value_quantizer,
            tokens: Vec::new(),
        })
    }

    /// The number of tokens currently stored in this cache.
    pub fn len(&self) -> usize {
        self.tokens.len()
    }

    /// True if the cache is empty.
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }

    /// Compress and store one (key, value) token pair.
    ///
    /// Called once per generation step. O(d) time.
    pub fn compress_token(&mut self, key: &[f32], value: &[f32]) -> Result<()> {
        if key.len() != self.config.head_dim {
            return Err(TurboQuantError::DimensionMismatch {
                expected: self.config.head_dim,
                got: key.len(),
            });
        }
        if value.len() != self.config.head_dim {
            return Err(TurboQuantError::DimensionMismatch {
                expected: self.config.head_dim,
                got: value.len(),
            });
        }

        let compressed_key = self
            .key_quantizer
            .as_ref()
            .map(|quantizer| quantizer.encode(key))
            .transpose()?;
        let compressed_value = self
            .value_quantizer
            .as_ref()
            .map(|quantizer| quantizer.encode(value))
            .transpose()?;
        self.tokens.push(KvShadowToken {
            compressed_key,
            compressed_value,
            exact_key: self.config.keep_exact_shadow.then(|| key.to_vec()),
            exact_value: self.config.keep_exact_shadow.then(|| value.to_vec()),
        });
        Ok(())
    }

    /// Compute attention logits q·kᵢ for all stored tokens i.
    ///
    /// Returns a vector of length `self.len()`. Does not decompress keys.
    /// Suitable for softmax → attention weight computation.
    ///
    /// O(n·d) time where n is the number of stored tokens.
    pub fn attention_scores(&self, query: &[f32]) -> Result<Vec<f32>> {
        self.attention_scores_with_options(query, AttentionScoreOptions::default())
    }

    /// Compute attention logits with optional transformer-style scaling.
    pub fn attention_scores_with_options(
        &self,
        query: &[f32],
        options: AttentionScoreOptions,
    ) -> Result<Vec<f32>> {
        if query.len() != self.config.head_dim {
            return Err(TurboQuantError::DimensionMismatch {
                expected: self.config.head_dim,
                got: query.len(),
            });
        }

        let mut scores: Vec<f32> = match &self.key_quantizer {
            Some(quantizer) => self
                .tokens
                .iter()
                .map(|t| {
                    let code = t.compressed_key.as_ref().ok_or_else(|| {
                        TurboQuantError::MalformedCode {
                            reason: "token is missing compressed key".into(),
                        }
                    })?;
                    quantizer.inner_product_estimate(code, query)
                })
                .collect(),
            None => self.exact_attention_scores(query),
        }?;
        let scale = match options.scale {
            AttentionScale::None => 1.0,
            AttentionScale::ByHeadDim => 1.0 / (self.config.head_dim as f32).sqrt(),
            AttentionScale::Custom(value) => value,
        };
        if !scale.is_finite() {
            return Err(TurboQuantError::MalformedCode {
                reason: "attention scale is not finite".into(),
            });
        }
        for score in &mut scores {
            *score *= scale;
        }
        Ok(scores)
    }

    /// Exact f32 attention logits from the retained shadow vectors.
    pub fn exact_attention_scores(&self, query: &[f32]) -> Result<Vec<f32>> {
        if query.len() != self.config.head_dim {
            return Err(TurboQuantError::DimensionMismatch {
                expected: self.config.head_dim,
                got: query.len(),
            });
        }
        self.tokens
            .iter()
            .map(|token| {
                let key =
                    token
                        .exact_key
                        .as_ref()
                        .ok_or_else(|| TurboQuantError::MalformedCode {
                            reason: "exact key fallback unavailable; enable keep_exact_shadow"
                                .into(),
                        })?;
                Ok(key.iter().zip(query.iter()).map(|(k, q)| k * q).sum())
            })
            .collect()
    }

    /// Compare exact and compressed key scores when exact shadows are retained.
    pub fn shadow_attention_scores(&self, query: &[f32]) -> Result<Vec<KvShadowScore>> {
        let exact = self.exact_attention_scores(query)?;
        let compressed = match &self.key_quantizer {
            Some(_) => self.attention_scores(query)?,
            None => exact.clone(),
        };
        Ok(exact
            .into_iter()
            .zip(compressed)
            .map(|(exact, compressed)| KvShadowScore {
                exact,
                compressed,
                abs_error: (exact - compressed).abs(),
            })
            .collect())
    }

    /// Softmax-weighted sum of approximate value reconstructions.
    ///
    /// This is the full compressed attention output for one head:
    /// `Σᵢ softmax(scores)ᵢ · decode(vᵢ)`
    ///
    /// Note: values are decoded (approximate reconstruction) rather than
    /// stored in compressed form, since weighted sums cannot be computed
    /// in compressed space. This is consistent with how KV cache compression
    /// is used in practice (compress keys for score computation; decode values
    /// for the weighted sum).
    pub fn attend(&self, query: &[f32]) -> Result<Vec<f32>> {
        if self.tokens.is_empty() {
            return Ok(vec![0.0f32; self.config.head_dim]);
        }

        let scores = self.attention_scores(query)?;

        // Numerically stable softmax.
        let max_score = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        let exps: Vec<f32> = scores.iter().map(|s| (s - max_score).exp()).collect();
        let sum_exp: f32 = exps.iter().sum();
        let weights: Vec<f32> = exps.iter().map(|e| e / sum_exp).collect();

        // Weighted sum of decoded values.
        let mut output = vec![0.0f32; self.config.head_dim];
        for (token, &weight) in self.tokens.iter().zip(weights.iter()) {
            let decoded = match (
                &self.value_quantizer,
                &token.compressed_value,
                &token.exact_value,
            ) {
                (Some(quantizer), Some(code), _) => quantizer.decode_approximate(code)?,
                (None, _, Some(exact)) => exact.clone(),
                _ => {
                    return Err(TurboQuantError::MalformedCode {
                        reason: "value fallback unavailable".into(),
                    });
                }
            };
            for (out, val) in output.iter_mut().zip(decoded.iter()) {
                *out += weight * val;
            }
        }

        Ok(output)
    }

    /// Decode the value vector for specific token indices (for top-k attention).
    pub fn decode_values(&self, indices: &[usize]) -> Result<Vec<Vec<f32>>> {
        indices
            .iter()
            .map(|&i| {
                if i >= self.tokens.len() {
                    return Err(TurboQuantError::DimensionMismatch {
                        expected: self.tokens.len(),
                        got: i + 1,
                    });
                }
                match (
                    &self.value_quantizer,
                    &self.tokens[i].compressed_value,
                    &self.tokens[i].exact_value,
                ) {
                    (Some(quantizer), Some(code), _) => quantizer.decode_approximate(code),
                    (None, _, Some(exact)) => Ok(exact.clone()),
                    _ => Err(TurboQuantError::MalformedCode {
                        reason: "value fallback unavailable".into(),
                    }),
                }
            })
            .collect()
    }

    /// Approximate total bytes used by all compressed tokens.
    pub fn compressed_bytes(&self) -> usize {
        self.tokens
            .iter()
            .map(|t| {
                t.compressed_key
                    .as_ref()
                    .map_or(0, TurboCode::encoded_bytes)
                    + t.compressed_value
                        .as_ref()
                        .map_or(0, TurboCode::encoded_bytes)
            })
            .sum()
    }

    /// Bytes that would be used if keys/values were stored uncompressed as f32.
    pub fn uncompressed_bytes(&self) -> usize {
        self.tokens.len() * 2 * self.config.head_dim * std::mem::size_of::<f32>()
    }

    /// Compression ratio: uncompressed / compressed.
    pub fn compression_ratio(&self) -> f32 {
        let compressed = self.compressed_bytes();
        if compressed == 0 {
            return 0.0;
        }
        self.uncompressed_bytes() as f32 / compressed as f32
    }

    /// Honest memory accounting for compressed sidecars plus retained shadows.
    pub fn memory_report(&self) -> KvMemoryReportV1 {
        let compressed_key_bytes: usize = self
            .tokens
            .iter()
            .map(|token| {
                token
                    .compressed_key
                    .as_ref()
                    .map_or(0, TurboCode::encoded_bytes)
            })
            .sum();
        let compressed_value_bytes: usize = self
            .tokens
            .iter()
            .map(|token| {
                token
                    .compressed_value
                    .as_ref()
                    .map_or(0, TurboCode::encoded_bytes)
            })
            .sum();
        let exact_shadow_key_bytes: usize = self
            .tokens
            .iter()
            .map(|token| token.exact_key.as_ref().map_or(0, |v| v.len() * 4))
            .sum();
        let exact_shadow_value_bytes: usize = self
            .tokens
            .iter()
            .map(|token| token.exact_value.as_ref().map_or(0, |v| v.len() * 4))
            .sum();
        let raw_fp32_baseline_bytes = self.uncompressed_bytes();
        let fp16_baseline_bytes = self.tokens.len() * 2 * self.config.head_dim * 2;
        let sidecar_only_bytes = compressed_key_bytes + compressed_value_bytes;
        let resident_bytes = sidecar_only_bytes + exact_shadow_key_bytes + exact_shadow_value_bytes;
        KvMemoryReportV1 {
            schema: "KvMemoryReportV1".into(),
            token_count: self.tokens.len(),
            head_dim: self.config.head_dim,
            compressed_key_bytes,
            compressed_value_bytes,
            exact_shadow_key_bytes,
            exact_shadow_value_bytes,
            raw_fp32_baseline_bytes,
            fp16_baseline_bytes,
            resident_bytes,
            sidecar_only_bytes,
            warnings: vec![
                "resident bytes include retained exact shadows; do not report sidecar-only bytes as runtime savings".into(),
            ],
        }
    }
}

fn quantizer_for_policy(
    head_dim: usize,
    policy: &KvQuantPolicy,
    seed: u64,
) -> Result<Option<TurboQuantizer>> {
    match policy {
        KvQuantPolicy::Exact => Ok(None),
        KvQuantPolicy::Quantized {
            bits,
            projections,
            mode,
            rotation_kind,
        } => TurboQuantizer::new_with_mode_and_rotation(
            head_dim,
            *bits,
            *projections,
            seed,
            *mode,
            *rotation_kind,
        )
        .map(Some),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn random_vec(dim: usize, seed: u64) -> Vec<f32> {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;
        use rand_distr::{Distribution, StandardNormal};
        let mut rng = ChaCha8Rng::seed_from_u64(seed);
        (0..dim).map(|_| StandardNormal.sample(&mut rng)).collect()
    }

    fn make_cache(dim: usize) -> KvCacheCompressor {
        KvCacheCompressor::new(KvCacheConfig {
            head_dim: dim,
            bits: 8,
            projections: dim / 4,
            seed: 42,
        })
        .unwrap()
    }

    #[test]
    fn empty_cache_returns_empty_scores() {
        let cache = make_cache(16);
        let scores = cache.attention_scores(&random_vec(16, 1)).unwrap();
        assert!(scores.is_empty());
    }

    #[test]
    fn token_count_increments_correctly() {
        let mut cache = make_cache(16);
        for i in 0..5 {
            cache
                .compress_token(&random_vec(16, i), &random_vec(16, i + 100))
                .unwrap();
        }
        assert_eq!(cache.len(), 5);
    }

    #[test]
    fn attention_scores_length_matches_token_count() {
        let mut cache = make_cache(16);
        for i in 0..10 {
            cache
                .compress_token(&random_vec(16, i), &random_vec(16, i + 100))
                .unwrap();
        }
        let scores = cache.attention_scores(&random_vec(16, 999)).unwrap();
        assert_eq!(scores.len(), 10);
    }

    #[test]
    fn highest_score_is_for_query_similar_key() {
        let dim = 16;
        let mut cache = make_cache(dim);

        // Add 9 random keys.
        for i in 0..9u64 {
            cache
                .compress_token(&random_vec(dim, i * 10), &random_vec(dim, i * 10 + 1))
                .unwrap();
        }

        // Add one key that is very close to the query.
        let query = random_vec(dim, 999);
        let similar_key: Vec<f32> = query.iter().map(|x| x + 0.001).collect();
        cache
            .compress_token(&similar_key, &random_vec(dim, 9000))
            .unwrap();

        let scores = cache.attention_scores(&query).unwrap();
        let best_idx = scores
            .iter()
            .enumerate()
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
            .map(|(i, _)| i)
            .unwrap();

        assert_eq!(
            best_idx, 9,
            "similar key should have highest attention score"
        );
    }

    #[test]
    fn attend_output_has_correct_dimension() {
        let dim = 16;
        let mut cache = make_cache(dim);
        for i in 0..5u64 {
            cache
                .compress_token(&random_vec(dim, i), &random_vec(dim, i + 50))
                .unwrap();
        }
        let output = cache.attend(&random_vec(dim, 1)).unwrap();
        assert_eq!(output.len(), dim);
    }

    #[test]
    fn attend_weights_sum_to_one_implicitly() {
        // If all values are the same vector v, attend() should return ≈ v.
        let dim = 16;
        let mut cache = make_cache(dim);
        let v = random_vec(dim, 77);

        for i in 0..8u64 {
            cache.compress_token(&random_vec(dim, i), &v).unwrap();
        }

        let output = cache.attend(&random_vec(dim, 1)).unwrap();

        // Output should be close to v since all values are the same.
        let error: f32 = output
            .iter()
            .zip(v.iter())
            .map(|(o, vi)| (o - vi).powi(2))
            .sum::<f32>()
            .sqrt();
        let v_norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!(
            error / v_norm < 0.15,
            "attend output too far from value: relative_error={:.4}",
            error / v_norm
        );
    }

    #[test]
    fn compression_ratio_is_above_one() {
        let mut cache = make_cache(64);
        for i in 0..20u64 {
            cache
                .compress_token(&random_vec(64, i), &random_vec(64, i + 1000))
                .unwrap();
        }
        let ratio = cache.compression_ratio();
        assert!(
            ratio > 1.0,
            "compression ratio should be > 1, got {ratio:.2}"
        );
        println!("compression ratio at 8 bits, d=64: {ratio:.2}x");
    }

    #[test]
    fn wrong_key_dimension_is_rejected() {
        let mut cache = make_cache(16);
        let result = cache.compress_token(&random_vec(8, 0), &random_vec(16, 1));
        assert!(result.is_err());
    }

    #[test]
    fn wrong_query_dimension_is_rejected() {
        let mut cache = make_cache(16);
        cache
            .compress_token(&random_vec(16, 0), &random_vec(16, 1))
            .unwrap();
        let result = cache.attention_scores(&random_vec(8, 0));
        assert!(result.is_err());
    }

    #[test]
    fn key_and_value_quantizers_are_independent() {
        // Encode the same vector as both key and value.
        // The codes should differ because the quantizers use different seeds.
        let mut cache = make_cache(16);
        let v = random_vec(16, 42);
        cache.compress_token(&v, &v).unwrap();

        let token = &cache.tokens[0];
        // Different rotation seeds → different angle indices with overwhelming probability.
        assert_ne!(
            token
                .compressed_key
                .as_ref()
                .unwrap()
                .polar_code
                .angle_indices,
            token
                .compressed_value
                .as_ref()
                .unwrap()
                .polar_code
                .angle_indices,
            "key and value quantizers should use independent rotations"
        );
    }

    #[test]
    fn exact_shadow_scores_are_available() {
        let mut cache = make_cache(16);
        cache
            .compress_token(&random_vec(16, 1), &random_vec(16, 2))
            .unwrap();
        let shadow = cache.shadow_attention_scores(&random_vec(16, 3)).unwrap();
        assert_eq!(shadow.len(), 1);
        assert!(shadow[0].abs_error.is_finite());
    }

    #[test]
    fn asymmetric_exact_value_policy_decodes_exact_values() {
        let dim = 16;
        let mut cache = KvCacheCompressor::new_runtime(KvRuntimeConfig {
            head_dim: dim,
            key_policy: KvQuantPolicy::quantized(8, dim / 4),
            value_policy: KvQuantPolicy::Exact,
            seed: 42,
            keep_exact_shadow: true,
        })
        .unwrap();
        let value = random_vec(dim, 99);
        cache.compress_token(&random_vec(dim, 1), &value).unwrap();
        assert_eq!(cache.decode_values(&[0]).unwrap()[0], value);
    }
}