ferrum-models 0.7.7

Model architectures (LLaMA, Qwen, BERT) for Ferrum inference
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
//! Multi-sequence paged-KV pool — Phase 4 of Metal paged attention.
//!
//! Replaces the per-cache_id `KvCache` allocation model with a
//! shared-pool architecture matching vLLM / mistral.rs:
//!
//! - **One pool per layer** holds K and V for *all* concurrent sequences.
//!   Sized to `MAX_TOTAL_BLOCKS × num_kv_heads × block_size × head_dim`.
//! - **Per-cache_id state** ([`PagedSeqState`]) carries that sequence's
//!   logical → physical block mapping (`block_table`) plus its current
//!   length (`len`). Multiple cache_ids can index into the same pool
//!   without colliding because their block_tables point at disjoint
//!   physical blocks (or shared ones, if prefix-caching is enabled
//!   later).
//! - **[`BlockAllocator`]** is a free-list owning physical block indices.
//!   `allocate` pops, `free` pushes back. Out-of-memory surfaces as
//!   `Result::Err` so the scheduler can refuse the request rather than
//!   panicking deep in the model forward.
//!
//! ## Ref counting (Phase 1 of block-level prefix cache)
//!
//! Each physical block has a `u16` ref count. `allocate` / `allocate_n`
//! create a block with `ref_count = 1`. `free` decrements; when the count
//! reaches zero the block returns to the free list. `acquire(block)`
//! increments — used by the prefix cache when another sequence wants to
//! share an already-resident block.
//!
//! Backwards compatible: pre-prefix-cache callers don't touch `acquire`,
//! so every block stays at ref=1 and `free` behaves identically to the
//! old single-ref free.
//!
//! ## Hash table (Phase 2 of block-level prefix cache)
//!
//! Each block can be tagged with a content-addressed `BlockHash` via
//! [`BlockAllocator::register_block_hash`]. The hash is computed by
//! chaining `parent_hash` with the block's tokens (see
//! [`block_hash_chain`]) so identical prefixes across requests produce
//! identical hash sequences.
//!
//! [`BlockAllocator::try_acquire_by_hash`] looks up a hash and bumps the
//! ref count on hit. **Soft-free blocks** (ref=0 still in `free_list`
//! with content intact) can be resurrected — this is what makes the cache
//! actually useful across requests. `allocate` evicts a stale hash when
//! a soft-free block is recycled with new content.
//!
//! ## What's *not* here yet (deferred to Phase 3-4 of prefix cache):
//! - **Engine integration**: hashing incoming prompt blocks, looking up
//!   matching prefix, splicing the existing blocks into the new seq's
//!   block_table, running prefill only on the unmatched suffix.
//! - **LRU eviction**: `free_list` is LIFO. Better policy would be
//!   LRU-soft-free so the oldest (least likely to hit) blocks are
//!   reused first. Today the most-recently-freed gets reused first
//!   (pessimistic for cross-request reuse).
//! - **Preemption**: when blocks run out we still just `Err`; real
//!   scheduler would refuse-and-queue or evict.
//! - Cross-process or cross-model pooling.

use ferrum_kernels::backend::Backend;
use ferrum_types::{FerrumError, Result, TokenId};
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::atomic::{AtomicUsize, Ordering};

/// Hash identifying a content-addressable KV block. Computed from a chain
/// of (parent_hash, block_tokens) — see [`block_hash_chain`].
pub type BlockHash = u64;

/// Compute the hash for one block, chained from its parent.
///
/// `parent` is the hash of the preceding block (0 for the first block).
/// `tokens` are the `block_size` token ids in this block.
///
/// Uses Rust's default `SipHasher` for collision resistance with no extra
/// deps. SipHash at ~1 GB/s processes a 16-token block (64 bytes) in tens
/// of ns; per-request hashing of <200 blocks is well under a microsecond
/// — invisible next to a prefill kernel.
pub fn block_hash(parent: BlockHash, tokens: &[TokenId]) -> BlockHash {
    let mut h = std::collections::hash_map::DefaultHasher::new();
    parent.hash(&mut h);
    for t in tokens {
        t.get().hash(&mut h);
    }
    h.finish()
}

/// Compute the chained hash sequence for a token list, broken into
/// `block_size`-token blocks. Trailing partial block (length <
/// `block_size`) is dropped — partial blocks can't be looked up
/// independently since their content depends on what gets appended next.
pub fn block_hash_chain(tokens: &[TokenId], block_size: usize) -> Vec<BlockHash> {
    let n = tokens.len() / block_size;
    let mut out = Vec::with_capacity(n);
    let mut parent: BlockHash = 0;
    for chunk in tokens.chunks_exact(block_size) {
        parent = block_hash(parent, chunk);
        out.push(parent);
    }
    out
}

/// LIFO free-list block allocator. `O(1)` allocate / free, no fragmentation
/// (all blocks are uniform size).
///
/// `capacity` is the total physical block count baked into the pool at
/// load time. The allocator is independent per-model (block index space
/// is not portable across models).
pub struct BlockAllocator {
    free_list: Vec<u32>,
    capacity: u32,
    /// Watermark: how many blocks have been live at peak, useful for
    /// pool-sizing diagnostics in the bench harness.
    peak_in_use: AtomicUsize,
    /// Per-block reference count. `ref_counts[i] == 0` ⟺ block i is on
    /// the free list (either freshly-uninitialized or soft-free with a
    /// hash still registered). `allocate` sets to 1; `acquire` increments;
    /// `free` decrements and (only when reaching 0) returns the block
    /// to the free list. Width 16 bits — supports up to 65535 concurrent
    /// sharers of the same physical KV block, far past any realistic
    /// continuous batching cap.
    ref_counts: Vec<u16>,
    /// Block-hash lookup. Contains entries for every block whose content
    /// has been hash-registered via [`Self::register_block_hash`] — both
    /// live (ref ≥ 1) and soft-free (ref = 0 but content not yet
    /// overwritten). [`Self::try_acquire_by_hash`] hits this map; on hit
    /// the entry is resurrected with ref = 1 if it was soft-free.
    /// Survives across requests — this is what makes the prefix cache
    /// actually useful.
    hash_table: HashMap<BlockHash, u32>,
    /// Reverse map block_id → hash, for cleanup. When a block's content
    /// gets overwritten by `allocate` returning it on a fresh request,
    /// we use this to erase its stale hash from `hash_table`.
    block_to_hash: Vec<Option<BlockHash>>,
}

impl BlockAllocator {
    /// Create a fresh allocator. All `num_blocks` blocks start free.
    /// Free-list is built so `allocate()` returns block 0 first, then 1,
    /// etc. — predictable for tests and ensures the lower physical
    /// blocks see the most reuse (better cache locality on M1's SLC).
    pub fn new(num_blocks: u32) -> Self {
        let mut free_list: Vec<u32> = (0..num_blocks).collect();
        free_list.reverse(); // pop() yields 0 first
        Self {
            free_list,
            capacity: num_blocks,
            peak_in_use: AtomicUsize::new(0),
            ref_counts: vec![0u16; num_blocks as usize],
            hash_table: HashMap::new(),
            block_to_hash: vec![None; num_blocks as usize],
        }
    }

    /// Allocate a single physical block. Returns `Err` when the pool is
    /// exhausted — caller is expected to refuse the request and queue
    /// it (or evict another seq, when that's wired up).
    ///
    /// New block starts with `ref_count = 1`. Drop one ref via `free()`
    /// to return it to the pool, or call `acquire()` for additional
    /// sharers.
    pub fn allocate(&mut self) -> Result<u32> {
        match self.pop_free_block_preferring_unhashed() {
            Some(b) => {
                debug_assert!(
                    self.ref_counts[b as usize] == 0,
                    "allocate yielded block {b} with non-zero ref_count {}",
                    self.ref_counts[b as usize]
                );
                // Soft-free block with a stale hash → content will be
                // overwritten, so erase the hash mapping.
                self.evict_hash_if_any(b);
                self.ref_counts[b as usize] = 1;
                let in_use = self.capacity as usize - self.free_list.len();
                self.peak_in_use.fetch_max(in_use, Ordering::Relaxed);
                Ok(b)
            }
            None => Err(FerrumError::resource_exhausted(format!(
                "paged KV pool exhausted (capacity={} blocks, all in use)",
                self.capacity
            ))),
        }
    }

    fn pop_free_block_preferring_unhashed(&mut self) -> Option<u32> {
        let pos = self
            .free_list
            .iter()
            .rposition(|&block| self.block_to_hash[block as usize].is_none())
            .unwrap_or_else(|| self.free_list.len().saturating_sub(1));
        if self.free_list.is_empty() {
            None
        } else {
            Some(self.free_list.swap_remove(pos))
        }
    }

    /// Internal: erase any hash mapping for a block whose content is
    /// about to be overwritten. No-op if the block had no hash registered.
    fn evict_hash_if_any(&mut self, block: u32) {
        if let Some(h) = self.block_to_hash[block as usize].take() {
            // Only erase from hash_table if it still points at THIS block.
            // (Defensive — a hash collision later registering the same h
            // to a different block could mean the entry now points
            // elsewhere; we shouldn't yank the live one.)
            if let Some(&mapped) = self.hash_table.get(&h) {
                if mapped == block {
                    self.hash_table.remove(&h);
                }
            }
        }
    }

    /// Bulk allocate. Atomic: either all `n` succeed or none are taken.
    /// Each returned block starts at `ref_count = 1`.
    pub fn allocate_n(&mut self, n: usize) -> Result<Vec<u32>> {
        if self.free_list.len() < n {
            return Err(FerrumError::resource_exhausted(format!(
                "paged KV pool exhausted: need {n} blocks but only {} free",
                self.free_list.len()
            )));
        }
        let mut out = Vec::with_capacity(n);
        for _ in 0..n {
            let b = self
                .pop_free_block_preferring_unhashed()
                .expect("free_list length checked above");
            debug_assert!(
                self.ref_counts[b as usize] == 0,
                "allocate_n yielded block {b} with non-zero ref_count"
            );
            self.evict_hash_if_any(b);
            self.ref_counts[b as usize] = 1;
            out.push(b);
        }
        let in_use = self.capacity as usize - self.free_list.len();
        self.peak_in_use.fetch_max(in_use, Ordering::Relaxed);
        Ok(out)
    }

    /// Look up a hash in the prefix-cache table. If the hash maps to a
    /// physical block (live or soft-free), bump its ref count and return
    /// the block id. A soft-free block (ref=0 in free_list) is
    /// resurrected — pulled out of the free list and ref set to 1.
    /// Returns `None` on cache miss.
    pub fn try_acquire_by_hash(&mut self, hash: BlockHash) -> Option<u32> {
        let &block = self.hash_table.get(&hash)?;
        let bi = block as usize;
        if self.ref_counts[bi] == 0 {
            // Soft-free: pull out of free_list (O(n) on free_list length;
            // typically small — few hundred at peak utilisation).
            if let Some(pos) = self.free_list.iter().rposition(|&b| b == block) {
                self.free_list.swap_remove(pos);
            } else {
                // Inconsistency: ref=0 but not in free_list. Shouldn't
                // happen unless someone bypassed the API. Treat as miss
                // to avoid corruption.
                debug_assert!(false, "block {block} has ref=0 but not in free_list");
                return None;
            }
            self.ref_counts[bi] = 1;
            let in_use = self.capacity as usize - self.free_list.len();
            self.peak_in_use.fetch_max(in_use, Ordering::Relaxed);
        } else {
            self.acquire(block);
        }
        Some(block)
    }

    /// Associate a hash with a block (after its content is written).
    /// If the block already had a hash, the old mapping is replaced.
    /// Block must currently be live (ref ≥ 1).
    pub fn register_block_hash(&mut self, block: u32, hash: BlockHash) {
        let bi = block as usize;
        debug_assert!(
            self.ref_counts[bi] > 0,
            "register_block_hash on block {block} with ref_count=0 (not allocated)"
        );
        // Replace prior hash if any.
        if let Some(old_h) = self.block_to_hash[bi].replace(hash) {
            if old_h != hash {
                if let Some(&mapped) = self.hash_table.get(&old_h) {
                    if mapped == block {
                        self.hash_table.remove(&old_h);
                    }
                }
            } else {
                // Same hash being re-registered; nothing to clean up.
                return;
            }
        }
        // Last-writer-wins on collision (same hash different content —
        // SipHash collisions on 64-token inputs are astronomically rare).
        self.hash_table.insert(hash, block);
    }

    /// Read accessor for tests / debugging.
    #[inline]
    pub fn hash_table_size(&self) -> usize {
        self.hash_table.len()
    }

    /// Increment the ref count for an already-allocated block. Used by
    /// prefix-caching paths when a new sequence wants to share a block
    /// that's already resident from a prior request.
    ///
    /// Panics in debug builds if the block is not currently allocated
    /// (ref_count==0) — a release-build path that calls `acquire` on a
    /// free block is a memory-safety bug we'd rather catch loudly.
    pub fn acquire(&mut self, block: u32) {
        let bi = block as usize;
        debug_assert!(
            self.ref_counts[bi] > 0,
            "acquire on block {block} with ref_count=0 (not currently allocated)"
        );
        self.ref_counts[bi] = self.ref_counts[bi]
            .checked_add(1)
            .expect("BlockAllocator ref_count u16 overflow (>65535 sharers)");
    }

    /// Bulk acquire — convenience for prefix-cache hit paths that take a
    /// list of physical block ids to share.
    pub fn acquire_many(&mut self, blocks: &[u32]) {
        for &b in blocks {
            self.acquire(b);
        }
    }

    /// Drop one ref from each block. Blocks whose ref_count hits 0 are
    /// returned to the free list and become available for the next
    /// `allocate*` call.
    ///
    /// Pre-prefix-cache callers see no behavioural change: every block
    /// they hold starts at ref=1 (from `allocate`), and `free` drops it
    /// to 0 — physically frees on the same call, same as before.
    pub fn free(&mut self, blocks: &[u32]) {
        for &b in blocks {
            let bi = b as usize;
            debug_assert!(
                self.ref_counts[bi] > 0,
                "free on block {b} with ref_count=0 (double-free)"
            );
            self.ref_counts[bi] -= 1;
            if self.ref_counts[bi] == 0 {
                self.free_list.push(b);
            }
        }
    }

    /// Read current ref count for a block. 0 means free, ≥1 means in
    /// use by that many sequences.
    #[inline]
    pub fn ref_count(&self, block: u32) -> u16 {
        self.ref_counts[block as usize]
    }

    pub fn free_count(&self) -> usize {
        self.free_list.len()
    }

    pub fn capacity(&self) -> u32 {
        self.capacity
    }

    pub fn peak_in_use(&self) -> usize {
        self.peak_in_use.load(Ordering::Relaxed)
    }
}

/// Per-sequence paged-KV state.
///
/// Holds the logical→physical block mapping for ONE sequence (one
/// `cache_id`) plus its current token count. The mapping is stored as
/// both:
/// - `blocks: Vec<u32>` — the host-side source of truth, used by the
///   block allocator + grow logic.
/// - `block_table_buf: B::Buffer` — a device-side u32 buffer that mirrors
///   `blocks` and is read directly by the paged Metal kernels (PR #68 /
///   #69). Kept in sync via [`Self::ensure_capacity`].
///
/// `context_lens_buf` is a 1-element u32 device buffer holding `len`.
/// The kernel reads it each forward; we update it via `B::write_u32`.
pub struct PagedSeqState<B: Backend> {
    pub blocks: Vec<u32>,
    pub block_table_buf: B::Buffer,
    pub context_lens_buf: B::Buffer,
    pub len: usize,
    pub block_size: usize,
    pub max_blocks_per_seq: usize,
}

impl<B: Backend> PagedSeqState<B> {
    /// Allocate buffers for a sequence that hasn't yet allocated any
    /// blocks. The allocator isn't touched here — the first call to
    /// [`Self::ensure_capacity`] does the real work.
    pub fn new(block_size: usize, max_blocks_per_seq: usize) -> Self {
        let block_table_buf =
            B::alloc_typed(ferrum_kernels::backend::Dtype::U32, max_blocks_per_seq);
        let context_lens_buf = B::alloc_typed(ferrum_kernels::backend::Dtype::U32, 1);
        // Initialise context_lens to 0 so a forward dispatched before
        // any token has been written sees an empty context.
        let mut ctx = B::new_context();
        let mut cl = context_lens_buf;
        B::write_typed::<u32>(&mut ctx, &mut cl, &[0u32]);
        B::sync(&mut ctx);
        Self {
            blocks: Vec::with_capacity(max_blocks_per_seq),
            block_table_buf,
            context_lens_buf: cl,
            len: 0,
            block_size,
            max_blocks_per_seq,
        }
    }

    /// Ensure the seq has enough blocks to hold `target_len` tokens.
    /// Allocates additional blocks from the pool if needed and re-syncs
    /// `block_table_buf` to the device. Idempotent if already big enough.
    pub fn ensure_capacity(
        &mut self,
        ctx: &mut B::Context,
        alloc: &mut BlockAllocator,
        target_len: usize,
    ) -> Result<()> {
        let needed = target_len.div_ceil(self.block_size);
        if needed > self.max_blocks_per_seq {
            return Err(FerrumError::model(format!(
                "paged KV: target_len={target_len} would need {needed} blocks, exceeds max_blocks_per_seq={}",
                self.max_blocks_per_seq
            )));
        }
        while self.blocks.len() < needed {
            let block = alloc.allocate()?;
            self.blocks.push(block);
        }
        // Mirror the host-side blocks list into the device buffer. We
        // write the FULL `max_blocks_per_seq` entries — unused slots
        // beyond `needed` are never read by the kernel (it only walks
        // `[0, ceil(context_len / block_size))`), but writing them
        // keeps the buffer's content predictable.
        let mut padded = self.blocks.clone();
        padded.resize(self.max_blocks_per_seq, 0);
        B::write_typed::<u32>(ctx, &mut self.block_table_buf, &padded);
        Ok(())
    }

    /// Update the on-device `context_lens_buf` to the current `self.len`.
    /// Call this after [`Self::ensure_capacity`] but before dispatching
    /// the paged attention kernel for this seq.
    pub fn sync_context_len(&mut self, ctx: &mut B::Context) {
        B::write_typed::<u32>(ctx, &mut self.context_lens_buf, &[self.len as u32]);
    }

    /// Release all blocks back to the allocator. Buffers are kept (cheap
    /// to reuse for a future cache_id), but blocks become available for
    /// other sequences. Sets `len` back to 0.
    pub fn release(&mut self, alloc: &mut BlockAllocator) {
        alloc.free(&self.blocks);
        self.blocks.clear();
        self.len = 0;
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn allocator_basic() {
        let mut a = BlockAllocator::new(4);
        assert_eq!(a.free_count(), 4);
        assert_eq!(a.allocate().unwrap(), 0);
        assert_eq!(a.allocate().unwrap(), 1);
        assert_eq!(a.allocate().unwrap(), 2);
        assert_eq!(a.allocate().unwrap(), 3);
        assert!(a.allocate().is_err());
        assert_eq!(a.free_count(), 0);

        a.free(&[1, 3]);
        assert_eq!(a.free_count(), 2);
        // LIFO: most recently freed comes back first.
        assert_eq!(a.allocate().unwrap(), 3);
        assert_eq!(a.allocate().unwrap(), 1);
    }

    #[test]
    fn allocator_atomic_n_failure() {
        let mut a = BlockAllocator::new(3);
        let _ = a.allocate().unwrap(); // 1 left in free_list... wait, 2 left
        let _ = a.allocate().unwrap();
        // 1 free, asking for 2 should fail without consuming the 1.
        assert!(a.allocate_n(2).is_err());
        assert_eq!(a.free_count(), 1);
    }

    #[test]
    fn allocator_peak_tracking() {
        let mut a = BlockAllocator::new(8);
        let blocks = a.allocate_n(5).unwrap();
        assert_eq!(a.peak_in_use(), 5);
        a.free(&blocks);
        assert_eq!(a.peak_in_use(), 5); // peak doesn't decrease
        let _ = a.allocate_n(3).unwrap();
        assert_eq!(a.peak_in_use(), 5);
    }

    #[test]
    fn refcount_allocate_starts_at_one() {
        let mut a = BlockAllocator::new(4);
        let b = a.allocate().unwrap();
        assert_eq!(a.ref_count(b), 1);
    }

    #[test]
    fn refcount_acquire_increments() {
        let mut a = BlockAllocator::new(4);
        let b = a.allocate().unwrap();
        a.acquire(b);
        a.acquire(b);
        assert_eq!(a.ref_count(b), 3);
    }

    #[test]
    fn refcount_free_decrements_no_physical_release() {
        let mut a = BlockAllocator::new(4);
        let b = a.allocate().unwrap();
        a.acquire(b); // ref=2
        a.free(&[b]); // ref=1 — NOT yet returned to free_list
        assert_eq!(a.ref_count(b), 1);
        assert_eq!(a.free_count(), 3); // only 3 of original 4 blocks free
    }

    #[test]
    fn refcount_free_physical_release_at_zero() {
        let mut a = BlockAllocator::new(4);
        let b = a.allocate().unwrap();
        a.acquire(b); // ref=2
        a.free(&[b]); // ref=1
        a.free(&[b]); // ref=0 → physical release
        assert_eq!(a.ref_count(b), 0);
        assert_eq!(a.free_count(), 4);
    }

    #[test]
    fn refcount_legacy_single_ref_behaviour_unchanged() {
        // Pre-prefix-cache code never calls `acquire`. Verify that
        // allocate+free behaves identically to the old version: block
        // immediately returns to the pool.
        let mut a = BlockAllocator::new(2);
        let b = a.allocate().unwrap();
        assert_eq!(a.free_count(), 1);
        a.free(&[b]);
        assert_eq!(a.free_count(), 2);
        assert_eq!(a.ref_count(b), 0);
        // Re-allocation reuses the slot (LIFO).
        let b2 = a.allocate().unwrap();
        assert_eq!(b2, b);
    }

    #[test]
    fn refcount_bulk_acquire_and_release() {
        let mut a = BlockAllocator::new(8);
        let blocks = a.allocate_n(3).unwrap();
        a.acquire_many(&blocks); // each ref=2
        for &b in &blocks {
            assert_eq!(a.ref_count(b), 2);
        }
        a.free(&blocks); // each ref=1, none physically released
        assert_eq!(a.free_count(), 5);
        a.free(&blocks); // each ref=0, all released
        assert_eq!(a.free_count(), 8);
    }

    // ── Phase 2: hash table tests ────────────────────────────────────────

    fn toks(ids: &[u32]) -> Vec<TokenId> {
        ids.iter().map(|&i| TokenId::new(i)).collect()
    }

    #[test]
    fn block_hash_chain_basic() {
        let tokens = toks(&[1, 2, 3, 4, 5, 6, 7, 8]);
        let chain = block_hash_chain(&tokens, 4);
        assert_eq!(chain.len(), 2);
        // Identical second block must produce a DIFFERENT chained hash
        // because its parent (first block) is the same here but the
        // chain order forces hash[1] = h(hash[0], tokens[4..8]).
        assert_ne!(chain[0], chain[1]);
    }

    #[test]
    fn block_hash_chain_identical_prefix_same_hashes() {
        let a = toks(&[10, 20, 30, 40, 50, 60, 70, 80]);
        let b = toks(&[10, 20, 30, 40, 99, 99, 99, 99]);
        let ca = block_hash_chain(&a, 4);
        let cb = block_hash_chain(&b, 4);
        assert_eq!(ca[0], cb[0], "first block matches → same hash[0]");
        assert_ne!(ca[1], cb[1], "second block differs → different hash[1]");
    }

    #[test]
    fn block_hash_chain_drops_partial_trailing() {
        let tokens = toks(&[1, 2, 3, 4, 5, 6, 7]); // 7 tokens, block_size=4 → 1 full
        let chain = block_hash_chain(&tokens, 4);
        assert_eq!(chain.len(), 1);
    }

    #[test]
    fn hash_table_register_and_acquire_live() {
        let mut a = BlockAllocator::new(4);
        let b = a.allocate().unwrap();
        let h = 0xDEAD_BEEFu64;
        a.register_block_hash(b, h);
        assert_eq!(a.hash_table_size(), 1);
        // Re-acquire by hash: must bump ref (block is live)
        let got = a.try_acquire_by_hash(h);
        assert_eq!(got, Some(b));
        assert_eq!(a.ref_count(b), 2);
    }

    #[test]
    fn hash_table_soft_free_resurrection() {
        let mut a = BlockAllocator::new(4);
        let b = a.allocate().unwrap();
        let h = 0xABCD_1234u64;
        a.register_block_hash(b, h);
        a.free(&[b]); // ref=0, block in free_list, hash STILL registered
        assert_eq!(a.free_count(), 4);
        assert_eq!(a.ref_count(b), 0);
        // Cache hit on the soft-free block: must resurrect with ref=1
        let got = a.try_acquire_by_hash(h);
        assert_eq!(got, Some(b));
        assert_eq!(a.ref_count(b), 1);
        // The block is OFF the free_list now.
        assert_eq!(a.free_count(), 3);
    }

    #[test]
    fn hash_table_miss_returns_none() {
        let mut a = BlockAllocator::new(4);
        let b = a.allocate().unwrap();
        a.register_block_hash(b, 1);
        assert_eq!(a.try_acquire_by_hash(99), None);
        // No side effects on miss
        assert_eq!(a.ref_count(b), 1);
    }

    #[test]
    fn hash_table_evicts_on_realloc() {
        // Allocate, register hash, free → soft-free with hash. Then
        // allocate a new block — the recycled one's hash must be erased
        // because its content is about to be overwritten.
        let mut a = BlockAllocator::new(2);
        let b1 = a.allocate().unwrap();
        let _held_clean_block = a.allocate().unwrap();
        let h = 0x1234u64;
        a.register_block_hash(b1, h);
        a.free(&[b1]); // soft-free
        assert_eq!(a.hash_table_size(), 1);
        // Allocate to consume the only free block — should recycle b1 and
        // evict its hash.
        let b2 = a.allocate().unwrap();
        assert_eq!(b2, b1, "must reuse b1 when no clean free block exists");
        assert_eq!(a.hash_table_size(), 0, "stale hash erased on realloc");
        assert_eq!(a.try_acquire_by_hash(h), None);
    }

    #[test]
    fn allocator_prefers_unhashed_free_block_over_soft_cached_hash() {
        let mut a = BlockAllocator::new(3);
        let cached = a.allocate().unwrap();
        let h = 0xCAFE_BABEu64;
        a.register_block_hash(cached, h);
        a.free(&[cached]);
        assert_eq!(a.hash_table_size(), 1);

        let fresh = a.allocate().unwrap();
        assert_ne!(fresh, cached, "clean free block should be used first");
        assert_eq!(a.hash_table_size(), 1, "cached block hash must survive");
        assert_eq!(a.try_acquire_by_hash(h), Some(cached));
    }

    #[test]
    fn hash_table_replace_block_hash() {
        let mut a = BlockAllocator::new(4);
        let b = a.allocate().unwrap();
        a.register_block_hash(b, 1);
        a.register_block_hash(b, 2); // overwrite
        assert_eq!(a.hash_table_size(), 1);
        assert_eq!(a.try_acquire_by_hash(1), None);
        assert_eq!(a.try_acquire_by_hash(2), Some(b));
    }
}