realizar 0.8.5

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
//! Streaming KV Cache (PMAT-802)
//!
//! Memory-efficient key-value cache for transformer inference.

// ============================================================================
// M6: Memory Efficiency - StreamingKVCache
// ============================================================================

/// Streaming KV cache for memory-efficient inference
///
/// Implements a bounded circular buffer for key-value cache that allows
/// efficient inference on long sequences without unbounded memory growth.
///
/// ## Memory Bound
///
/// Total memory = num_layers * max_positions * num_heads * head_dim * 2 (K+V) * sizeof(f32)
///
/// For 7B model (32 layers, 2048 positions, 32 heads, 128 head_dim):
/// = 32 * 2048 * 32 * 128 * 2 * 4 = ~2GB
///
/// ## Usage
///
/// ```rust,ignore
/// let mut cache = StreamingKVCache::new(32, 2048, 32, 128);
/// cache.append(0, &key_vec, &value_vec);
/// let (keys, values) = cache.get_range(0, 0, 100);
/// ```
pub struct StreamingKVCache {
    /// Number of transformer layers
    num_layers: usize,
    /// Maximum cached positions (context length)
    max_positions: usize,
    /// Number of attention heads
    num_heads: usize,
    /// Dimension per head
    head_dim: usize,
    /// Key cache per layer [num_layers][max_positions * num_heads * head_dim]
    keys: Vec<Vec<f32>>,
    /// Value cache per layer
    values: Vec<Vec<f32>>,
    /// Current write position (circular)
    position: usize,
    /// Number of valid positions cached
    valid_positions: usize,
}

impl StreamingKVCache {
    /// Create a new streaming KV cache
    ///
    /// # Arguments
    ///
    /// * `num_layers` - Number of transformer layers
    /// * `max_positions` - Maximum context length to cache
    /// * `num_heads` - Number of attention heads
    /// * `head_dim` - Dimension per attention head
    #[must_use]
    pub fn new(num_layers: usize, max_positions: usize, num_heads: usize, head_dim: usize) -> Self {
        let kv_size = max_positions * num_heads * head_dim;
        Self {
            num_layers,
            max_positions,
            num_heads,
            head_dim,
            keys: vec![vec![0.0f32; kv_size]; num_layers],
            values: vec![vec![0.0f32; kv_size]; num_layers],
            position: 0,
            valid_positions: 0,
        }
    }

    /// Append key-value pair for a single position
    ///
    /// # Arguments
    ///
    /// * `layer` - Layer index (0-indexed)
    /// * `key` - Key vector [num_heads * head_dim]
    /// * `value` - Value vector [num_heads * head_dim]
    ///
    /// # Panics
    ///
    /// Panics if layer index is out of bounds or key/value dimensions are wrong.
    pub fn append(&mut self, layer: usize, key: &[f32], value: &[f32]) {
        let kv_dim = self.num_heads * self.head_dim;
        assert!(layer < self.num_layers, "Layer index out of bounds");
        assert_eq!(key.len(), kv_dim, "Key dimension mismatch");
        assert_eq!(value.len(), kv_dim, "Value dimension mismatch");

        let offset = self.position * kv_dim;
        self.keys[layer][offset..offset + kv_dim].copy_from_slice(key);
        self.values[layer][offset..offset + kv_dim].copy_from_slice(value);

        // Update position only after last layer
        if layer == self.num_layers - 1 {
            self.position = (self.position + 1) % self.max_positions;
            self.valid_positions = (self.valid_positions + 1).min(self.max_positions);
        }
    }

    /// Get keys and values for a range of positions
    ///
    /// # Arguments
    ///
    /// * `layer` - Layer index
    /// * `start` - Start position (inclusive)
    /// * `end` - End position (exclusive)
    ///
    /// # Returns
    ///
    /// Tuple of (keys, values) slices
    #[must_use]
    pub fn get_range(&self, layer: usize, start: usize, end: usize) -> (&[f32], &[f32]) {
        let kv_dim = self.num_heads * self.head_dim;
        let start_offset = start * kv_dim;
        let end_offset = end * kv_dim;

        (
            &self.keys[layer][start_offset..end_offset],
            &self.values[layer][start_offset..end_offset],
        )
    }

    /// Get all valid cached keys and values for a layer
    ///
    /// # Arguments
    ///
    /// * `layer` - Layer index
    ///
    /// # Returns
    ///
    /// Tuple of (keys, values) for all valid positions
    #[must_use]
    pub fn get_valid(&self, layer: usize) -> (&[f32], &[f32]) {
        self.get_range(layer, 0, self.valid_positions)
    }

    /// Get current number of valid cached positions
    #[must_use]
    pub fn len(&self) -> usize {
        self.valid_positions
    }

    /// Check if cache is empty
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.valid_positions == 0
    }

    /// Get maximum positions (context length)
    #[must_use]
    pub fn max_positions(&self) -> usize {
        self.max_positions
    }

    /// Reset the cache
    pub fn clear(&mut self) {
        self.position = 0;
        self.valid_positions = 0;
        // Note: We don't zero the memory for performance
    }

    /// Calculate memory usage in bytes
    #[must_use]
    pub fn memory_bytes(&self) -> usize {
        let kv_size = self.max_positions * self.num_heads * self.head_dim;
        // Keys + Values, f32 = 4 bytes
        self.num_layers * kv_size * 2 * 4
    }

    /// Calculate memory usage in megabytes
    #[must_use]
    pub fn memory_mb(&self) -> f64 {
        self.memory_bytes() as f64 / (1024.0 * 1024.0)
    }
}

/// Streaming KV cache with FP16 storage for memory efficiency (M12)
///
/// Uses half-precision (FP16) storage to halve memory usage compared to FP32,
/// enabling support for ultra-long contexts (65536+) on consumer GPUs.
///
/// # Memory Efficiency
///
/// For 65536 context with 7B model config:
/// - FP32: 32 layers × 65536 pos × 32 heads × 128 dim × 2 × 4 bytes = 68.72 GB
/// - FP16: 32 layers × 65536 pos × 32 heads × 128 dim × 2 × 2 bytes = 34.36 GB
///
/// # Example
///
/// ```
/// use realizar::gpu::StreamingKVCacheFp16;
///
/// let mut cache = StreamingKVCacheFp16::new(32, 65536, 32, 128);
/// assert!(cache.memory_mb() < 36000.0); // < 36 GB
/// ```
pub struct StreamingKVCacheFp16 {
    /// Number of transformer layers
    num_layers: usize,
    /// Maximum cached positions (context length)
    max_positions: usize,
    /// Number of attention heads
    num_heads: usize,
    /// Dimension per head
    head_dim: usize,
    /// Key cache per layer [num_layers][max_positions * num_heads * head_dim] stored as FP16 bits
    keys: Vec<Vec<u16>>,
    /// Value cache per layer stored as FP16 bits
    values: Vec<Vec<u16>>,
    /// Current write position (circular)
    position: usize,
    /// Number of valid positions cached
    valid_positions: usize,
}

impl StreamingKVCacheFp16 {
    /// Create a new FP16 streaming KV cache
    ///
    /// # Arguments
    ///
    /// * `num_layers` - Number of transformer layers
    /// * `max_positions` - Maximum context length to cache
    /// * `num_heads` - Number of attention heads
    /// * `head_dim` - Dimension per attention head
    #[must_use]
    pub fn new(num_layers: usize, max_positions: usize, num_heads: usize, head_dim: usize) -> Self {
        let kv_size = max_positions * num_heads * head_dim;
        Self {
            num_layers,
            max_positions,
            num_heads,
            head_dim,
            keys: vec![vec![0u16; kv_size]; num_layers],
            values: vec![vec![0u16; kv_size]; num_layers],
            position: 0,
            valid_positions: 0,
        }
    }

    /// Convert f32 to FP16 bits
    #[inline]
    pub(crate) fn f32_to_f16(value: f32) -> u16 {
        half::f16::from_f32(value).to_bits()
    }

    /// Convert FP16 bits to f32
    #[inline]
    pub(crate) fn f16_to_f32(bits: u16) -> f32 {
        half::f16::from_bits(bits).to_f32()
    }

    /// Append key-value pair for a single position (FP32 input, stored as FP16)
    ///
    /// # Arguments
    ///
    /// * `layer` - Layer index (0-indexed)
    /// * `key` - Key vector [num_heads * head_dim] as FP32
    /// * `value` - Value vector [num_heads * head_dim] as FP32
    ///
    /// # Panics
    ///
    /// Panics if layer index is out of bounds or key/value dimensions are wrong.
    pub fn append(&mut self, layer: usize, key: &[f32], value: &[f32]) {
        let kv_dim = self.num_heads * self.head_dim;
        assert!(layer < self.num_layers, "Layer index out of bounds");
        assert_eq!(key.len(), kv_dim, "Key dimension mismatch");
        assert_eq!(value.len(), kv_dim, "Value dimension mismatch");

        let offset = self.position * kv_dim;

        // Convert FP32 to FP16 and store
        for (i, &k) in key.iter().enumerate() {
            self.keys[layer][offset + i] = Self::f32_to_f16(k);
        }
        for (i, &v) in value.iter().enumerate() {
            self.values[layer][offset + i] = Self::f32_to_f16(v);
        }

        // Update position only after last layer
        if layer == self.num_layers - 1 {
            self.position = (self.position + 1) % self.max_positions;
            self.valid_positions = (self.valid_positions + 1).min(self.max_positions);
        }
    }

    /// Get keys and values for a range of positions (converted back to FP32)
    ///
    /// # Arguments
    ///
    /// * `layer` - Layer index
    /// * `start` - Start position (inclusive)
    /// * `end` - End position (exclusive)
    ///
    /// # Returns
    ///
    /// Tuple of (keys, values) as Vec<f32>
    #[must_use]
    pub fn get_range_f32(&self, layer: usize, start: usize, end: usize) -> (Vec<f32>, Vec<f32>) {
        let kv_dim = self.num_heads * self.head_dim;
        let start_offset = start * kv_dim;
        let end_offset = end * kv_dim;

        let keys: Vec<f32> = self.keys[layer][start_offset..end_offset]
            .iter()
            .map(|&bits| Self::f16_to_f32(bits))
            .collect();

        let values: Vec<f32> = self.values[layer][start_offset..end_offset]
            .iter()
            .map(|&bits| Self::f16_to_f32(bits))
            .collect();

        (keys, values)
    }

    /// Get raw FP16 keys and values for a range of positions
    #[must_use]
    pub fn get_range_raw(&self, layer: usize, start: usize, end: usize) -> (&[u16], &[u16]) {
        let kv_dim = self.num_heads * self.head_dim;
        let start_offset = start * kv_dim;
        let end_offset = end * kv_dim;

        (
            &self.keys[layer][start_offset..end_offset],
            &self.values[layer][start_offset..end_offset],
        )
    }

    /// Get all valid cached keys and values for a layer (as FP32)
    #[must_use]
    pub fn get_valid_f32(&self, layer: usize) -> (Vec<f32>, Vec<f32>) {
        self.get_range_f32(layer, 0, self.valid_positions)
    }

    /// Get current number of valid cached positions
    #[must_use]
    pub fn len(&self) -> usize {
        self.valid_positions
    }

    /// Check if cache is empty
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.valid_positions == 0
    }

    /// Get maximum positions (context length)
    #[must_use]
    pub fn max_positions(&self) -> usize {
        self.max_positions
    }

    /// Reset the cache
    pub fn clear(&mut self) {
        self.position = 0;
        self.valid_positions = 0;
    }

    /// Calculate memory usage in bytes (half of FP32 version)
    #[must_use]
    pub fn memory_bytes(&self) -> usize {
        let kv_size = self.max_positions * self.num_heads * self.head_dim;
        // Keys + Values, u16 (FP16) = 2 bytes
        self.num_layers * kv_size * 2 * 2
    }

    /// Calculate memory usage in megabytes
    #[must_use]
    pub fn memory_mb(&self) -> f64 {
        self.memory_bytes() as f64 / (1024.0 * 1024.0)
    }
}

include!("streaming_kv_streaming.rs");

// =========================================================================
// FALSIFY-KV: kv-cache-sizing-v1.yaml contract (realizar StreamingKVCache)
//
// Five-Whys (PMAT-354):
//   Why 1: realizar had KV cache tests but zero FALSIFY-KV-* tests
//   Why 2: tests verify append/get, not sizing formula invariants
//   Why 3: no mapping from kv-cache-sizing-v1.yaml to realizar test names
//   Why 4: realizar predates the provable-contracts YAML convention
//   Why 5: KV sizing was "obviously correct" (simple multiplication)
//
// References:
//   - provable-contracts/contracts/kv-cache-sizing-v1.yaml
// =========================================================================
#[cfg(test)]
mod kv_contract_tests {
    use super::*;

    /// FALSIFY-KV-001: Per-token KV bytes — 2 * n_kv * d_k * sizeof(f32)
    ///
    /// Formula: memory = num_layers * max_positions * num_heads * head_dim * 2 * 4
    #[test]
    fn falsify_kv_001_memory_formula() {
        let test_cases = vec![
            // (layers, max_pos, heads, head_dim)
            (1, 1, 1, 1),
            (32, 2048, 32, 128), // 7B model
            (40, 4096, 40, 128), // 13B model
            (1, 512, 8, 64),
        ];

        for (nl, mp, nh, hd) in test_cases {
            let cache = StreamingKVCache::new(nl, mp, nh, hd);
            let expected = nl * mp * nh * hd * 2 * 4;
            assert_eq!(
                cache.memory_bytes(),
                expected,
                "FALSIFIED KV-001: memory_bytes({nl}, {mp}, {nh}, {hd}) = {}, expected {expected}",
                cache.memory_bytes()
            );
        }
    }

    /// FALSIFY-KV-002: Monotonic — longer sequence => more memory
    #[test]
    fn falsify_kv_002_monotonic_sequence_length() {
        let seq_lengths = [128, 256, 512, 1024, 2048, 4096];
        let mut prev_bytes = 0;

        for &sl in &seq_lengths {
            let cache = StreamingKVCache::new(32, sl, 32, 128);
            let bytes = cache.memory_bytes();
            assert!(
                bytes > prev_bytes,
                "FALSIFIED KV-002: memory({sl}) = {bytes} not > memory(prev) = {prev_bytes}"
            );
            prev_bytes = bytes;
        }
    }

    /// FALSIFY-KV-002b: Monotonic — more layers => more memory
    #[test]
    fn falsify_kv_002b_monotonic_layers() {
        let layer_counts = [1, 8, 16, 32, 40, 64];
        let mut prev_bytes = 0;

        for &nl in &layer_counts {
            let cache = StreamingKVCache::new(nl, 2048, 32, 128);
            let bytes = cache.memory_bytes();
            assert!(
                bytes > prev_bytes,
                "FALSIFIED KV-002b: memory(layers={nl}) = {bytes} not > {prev_bytes}"
            );
            prev_bytes = bytes;
        }
    }

    /// FALSIFY-KV-001b: FP16 cache uses exactly half the memory of FP32
    #[test]
    fn falsify_kv_001b_fp16_half_memory() {
        let nl = 32;
        let mp = 2048;
        let nh = 32;
        let hd = 128;

        let f32_cache = StreamingKVCache::new(nl, mp, nh, hd);
        let f16_cache = StreamingKVCacheFp16::new(nl, mp, nh, hd);

        let f32_bytes = f32_cache.memory_bytes();
        let f16_bytes = f16_cache.memory_bytes();

        assert_eq!(
            f16_bytes * 2,
            f32_bytes,
            "FALSIFIED KV-001b: FP16 ({f16_bytes}) * 2 != FP32 ({f32_bytes})"
        );
    }
}