dynamo_tokens/
lib.rs

1// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4#![deny(missing_docs)]
5
6//! Types and utilities for handling sequences of tokens, including block creation and hashing.
7
8use bytemuck::cast_slice;
9use derive_getters::Dissolve;
10use std::ops::Range;
11
12pub mod blocks;
13mod radix;
14pub use radix::PositionalRadixTree;
15
16/// Trait for hashes that include position information.
17pub trait PositionalHash {
18    /// Returns the position associated with the hash.
19    fn position(&self) -> u64;
20}
21
22/// A token is represented as a 32-bit unsigned integer.
23pub type Token = u32;
24
25/// A salt used for hashing, represented as a vector of bytes.
26/// This might encode model architecture, weights, PEFT info, etc.
27pub type Salt = Vec<u8>;
28
29/// A 64-bit hash of the salt. Computed once per request and used as the seed for
30/// every block-hash computation in that request.
31///
32/// The canonical construction path is [`compute_salt_hash_from_bytes`] (or
33/// `dynamo_kv_hashing::Request::salt_hash` at the application layer).
34pub type SaltHash = u64;
35
36/// A 64-bit hash computed from the tokens within a single block (with optional MM
37/// frames), seeded by the request's [`SaltHash`].
38///
39/// The canonical construction path is [`compute_block_hash`].
40pub type BlockHash = u64;
41
42/// A 64-bit sequence-aware hash. Equals the [`BlockHash`] at position 0 and
43/// [`compute_next_sequence_hash(prev_seq, block_hash)`](compute_next_sequence_hash)
44/// at every subsequent position. Salt propagates through `seq_hash[0]` since
45/// `block_hash[0]` already encodes it.
46pub type SequenceHash = u64;
47
48/// Computes a hash of the data using the given seed (raw u64).
49///
50/// Prefer [`compute_block_hash`] / [`compute_salt_hash_from_bytes`] for typed
51/// construction; this raw-u64 form is kept for low-level callers.
52pub fn compute_hash_v2(data: &[u8], seed: u64) -> u64 {
53    xxhash_rust::xxh3::xxh3_64_with_seed(data, seed)
54}
55
56/// Canonical XXH3 seed used by every chain-step `(parent_seq, child_block_hash) → next_seq`
57/// in this codebase. Must match `dynamo_kv_router::protocols::XXH3_SEED` — the router's
58/// `compute_seq_hash_for_block` and the `PositionalIndexer`'s chain re-computation both
59/// route through [`compute_next_sequence_hash`] (or equivalently seeded helpers in
60/// `kv-router`), so changing this constant requires all of them to flip in lockstep.
61pub const CHAIN_XXH3_SEED: u64 = 1337;
62
63/// Chain-step for sequence hashing: returns the [`SequenceHash`] at `position + 1`
64/// given the parent's `SequenceHash` and the child block's [`BlockHash`].
65///
66/// This is the single source of truth for the chain recurrence used by:
67/// - [`PositionalLineageHash::extend`]
68/// - [`TokenBlock::from_chunk`]
69/// - `dynamo_kv_router::protocols::compute_next_seq_hash` (request side)
70/// - `dynamo_kv_router::indexer::PositionalIndexer` (chain re-validation)
71///
72/// Salt is already mixed into `block_hash[0]` and propagates through every parent, so
73/// the per-step seed is a constant: re-feeding salt at each step would be redundant.
74/// Any constant seed preserves PLH composability — the value is set to
75/// [`CHAIN_XXH3_SEED`] to match the router's existing wire format.
76#[inline]
77pub fn compute_next_sequence_hash(
78    parent_sequence_hash: SequenceHash,
79    child_block_hash: BlockHash,
80) -> SequenceHash {
81    let combined = [parent_sequence_hash, child_block_hash];
82    compute_hash_v2(cast_slice(&combined), CHAIN_XXH3_SEED)
83}
84
85/// Custom serde codec that encodes a `u128` as a 16-byte big-endian byte sequence.
86///
87/// MessagePack (`rmp-serde`) has no native 128-bit integer type, so the default
88/// `u128` derive does not roundtrip reliably. Encoding as raw bytes is supported
89/// uniformly across msgpack, JSON, CBOR, etc.
90mod serde_bytes_u128 {
91    use serde::{Deserializer, Serializer};
92
93    pub fn serialize<S>(val: &u128, serializer: S) -> Result<S::Ok, S::Error>
94    where
95        S: Serializer,
96    {
97        serializer.serialize_bytes(&val.to_be_bytes())
98    }
99
100    pub fn deserialize<'de, D>(deserializer: D) -> Result<u128, D::Error>
101    where
102        D: Deserializer<'de>,
103    {
104        use serde::de::{self, SeqAccess, Visitor};
105        use std::fmt;
106
107        struct V;
108        impl<'de> Visitor<'de> for V {
109            type Value = [u8; 16];
110
111            fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
112                f.write_str("16 bytes (msgpack bin) or a sequence of 16 u8 values")
113            }
114
115            fn visit_bytes<E: de::Error>(self, v: &[u8]) -> Result<[u8; 16], E> {
116                v.try_into()
117                    .map_err(|_| E::invalid_length(v.len(), &"16 bytes"))
118            }
119
120            fn visit_borrowed_bytes<E: de::Error>(self, v: &'de [u8]) -> Result<[u8; 16], E> {
121                self.visit_bytes(v)
122            }
123
124            fn visit_byte_buf<E: de::Error>(self, v: Vec<u8>) -> Result<[u8; 16], E> {
125                self.visit_bytes(&v)
126            }
127
128            fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<[u8; 16], A::Error> {
129                let mut arr = [0u8; 16];
130                for (i, slot) in arr.iter_mut().enumerate() {
131                    *slot = seq
132                        .next_element()?
133                        .ok_or_else(|| de::Error::invalid_length(i, &"16 u8 elements"))?;
134                }
135                Ok(arr)
136            }
137        }
138
139        let arr = deserializer.deserialize_bytes(V)?;
140        Ok(u128::from_be_bytes(arr))
141    }
142}
143
144/// Canonical [`BlockHash`] construction: XXH3 over the per-block byte buffer
145/// (already encoded by [`compute_block_bytes_with_mm`] or `cast_slice` for the
146/// no-MM path), seeded by [`SaltHash`].
147#[inline]
148pub fn compute_block_hash(block_bytes: &[u8], salt: SaltHash) -> BlockHash {
149    compute_hash_v2(block_bytes, salt)
150}
151
152/// Canonical [`SaltHash`] construction from a pre-canonicalized salt-payload byte
153/// buffer. Application-layer callers should use `dynamo_kv_hashing::Request::salt_hash`
154/// which canonicalizes `(salt, lora_name)` first; this function is the low-level path.
155#[inline]
156pub fn compute_salt_hash_from_bytes(payload: &[u8]) -> SaltHash {
157    compute_hash_v2(payload, 0)
158}
159
160/// Metadata describing a single multimodal placeholder run within a token sequence.
161///
162/// A run occupies `length` consecutive slots starting at `offset`. The token IDs at
163/// those slot positions are opaque for hashing — the `(mm_hash, run_offset)` pair drives
164/// the per-slot bytes during block formation.
165///
166/// See [`compute_block_bytes_with_mm`] for the byte-encoding rule.
167#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
168pub struct TokenBlockMmInfo {
169    /// Hash identifying the multimodal object (image / audio / etc.).
170    pub mm_hash: u64,
171    /// Start position of the placeholder run in the full token sequence (zero-based).
172    pub offset: usize,
173    /// Number of placeholder slots in the run.
174    pub length: usize,
175}
176
177/// Slot-tag byte distinguishing real-token slots from multimodal placeholder slots in
178/// the per-block byte buffer. See [`compute_block_bytes_with_mm`].
179pub const MM_SLOT_TAG_TOKEN: u8 = 0x00;
180/// Slot-tag byte for multimodal placeholder slots. See [`compute_block_bytes_with_mm`].
181pub const MM_SLOT_TAG_PLACEHOLDER: u8 = 0x01;
182
183impl TokenBlockMmInfo {
184    /// Returns the exclusive end position of this run, or `None` on `usize` overflow.
185    #[inline]
186    pub fn checked_end(&self) -> Option<usize> {
187        self.offset.checked_add(self.length)
188    }
189
190    /// Returns the exclusive end position of this run.
191    ///
192    /// # Panics
193    /// Panics if `offset + length` overflows `usize`. Prefer [`Self::checked_end`] in
194    /// validation paths; this helper is for already-validated runs.
195    #[inline]
196    pub fn end(&self) -> usize {
197        self.checked_end()
198            .expect("TokenBlockMmInfo::end overflowed usize; run was not validated")
199    }
200
201    /// Returns `true` if the given absolute position falls inside this run.
202    /// Returns `false` if the run's end overflows `usize` (such a run is invalid; use
203    /// [`validate_and_sort_mm_info`] before relying on this method).
204    #[inline]
205    pub fn covers(&self, position: usize) -> bool {
206        match self.checked_end() {
207            Some(end) => position >= self.offset && position < end,
208            None => false,
209        }
210    }
211}
212
213/// Errors raised while validating [`TokenBlockMmInfo`] inputs.
214#[derive(Debug, Clone, Copy, PartialEq, Eq, thiserror::Error)]
215pub enum MmInfoError {
216    /// The run extends past the end of the token sequence.
217    #[error(
218        "mm_info range starting at {offset} (length {length}) exceeds tokens length {tokens_len}"
219    )]
220    OutOfBounds {
221        /// Run start.
222        offset: usize,
223        /// Run length.
224        length: usize,
225        /// Length of the token sequence the run was validated against.
226        tokens_len: usize,
227    },
228    /// `offset + length` overflows `usize`.
229    #[error("mm_info range starting at {offset} (length {length}) overflows usize")]
230    OffsetOverflow {
231        /// Run start.
232        offset: usize,
233        /// Run length.
234        length: usize,
235    },
236    /// Two runs overlap.
237    #[error("mm_info ranges overlap at position {position}")]
238    Overlapping {
239        /// Position where the overlap begins.
240        position: usize,
241    },
242    /// A run has zero length.
243    #[error("mm_info length must be greater than zero")]
244    EmptyRun,
245}
246
247/// Validates `mm_info` against `tokens_len` and returns a copy sorted by `offset`.
248///
249/// Validation rules:
250/// - Every run must have `length > 0`.
251/// - `offset + length` must not overflow `usize`.
252/// - Every run's end (`offset + length`) must be `<= tokens_len`.
253/// - No two runs may overlap.
254pub fn validate_and_sort_mm_info(
255    mm_info: &[TokenBlockMmInfo],
256    tokens_len: usize,
257) -> Result<Vec<TokenBlockMmInfo>, MmInfoError> {
258    let mut sorted: Vec<TokenBlockMmInfo> = mm_info.to_vec();
259    sorted.sort_by_key(|m| m.offset);
260    let mut prev_end = 0usize;
261    for m in &sorted {
262        if m.length == 0 {
263            return Err(MmInfoError::EmptyRun);
264        }
265        let end = m
266            .offset
267            .checked_add(m.length)
268            .ok_or(MmInfoError::OffsetOverflow {
269                offset: m.offset,
270                length: m.length,
271            })?;
272        if end > tokens_len {
273            return Err(MmInfoError::OutOfBounds {
274                offset: m.offset,
275                length: m.length,
276                tokens_len,
277            });
278        }
279        if m.offset < prev_end {
280            return Err(MmInfoError::Overlapping { position: m.offset });
281        }
282        prev_end = end;
283    }
284    Ok(sorted)
285}
286
287/// Returns `true` if any run in `mm_runs` overlaps the block `[block_offset, block_offset + len)`.
288/// `mm_runs` must be validated and sorted.
289fn block_has_mm(block_offset: usize, len: usize, mm_runs: &[TokenBlockMmInfo]) -> bool {
290    let block_end = block_offset.saturating_add(len);
291    mm_runs
292        .iter()
293        .any(|m| m.offset < block_end && m.end() > block_offset)
294}
295
296/// Builds the byte buffer used to compute a block's [`BlockHash`].
297///
298/// **Two encodings, picked per-block:**
299///
300/// 1. **Legacy / zero-MM** — when no run in `mm_runs` overlaps this block, the buffer is
301///    `bytemuck::cast_slice(tokens)` (4 bytes per slot, LE u32). This matches the existing
302///    `compute_hash_v2(cast_slice(&tokens), salt_hash)` path used by every existing
303///    [`TokenBlock`] in `dynamo_tokens` and kvbm — preserving cache identity for any block
304///    that is not itself MM-affected.
305///
306/// 2. **Tagged / MM-affected** — when at least one run overlaps this block, every slot
307///    emits a fixed 13-byte frame:
308///    - Real-token slot: `[MM_SLOT_TAG_TOKEN | token_id u32 LE | 0u64 LE]`
309///      The trailing `0u64` is **frame padding only** — it has no semantic meaning,
310///      it is there so the real-token frame matches the placeholder frame's width.
311///      Token IDs at placeholder positions are *ignored* by this encoder; whether
312///      a slot is a placeholder is determined solely by `mm_runs[run_idx].covers(g)`.
313///    - Placeholder slot: `[MM_SLOT_TAG_PLACEHOLDER | run_offset u32 LE | mm_hash u64 LE]`,
314///      where `run_offset = (block_offset + s) - run.offset`.
315///
316///    The 1-byte tag plus the fixed-width frame make the encoding self-delimiting and
317///    slot-position-preserving: two MM-affected byte buffers compare equal iff they describe
318///    the same `(slot_kind, slot_payload)` sequence at the same slot positions.
319///
320/// The two encodings have different per-slot widths (4 vs 13), so an all-tokens block can
321/// never produce the same byte buffer as an MM-affected block — eliminating cross-encoding
322/// collisions.
323///
324/// `mm_runs` must be validated and sorted (typically the output of
325/// [`validate_and_sort_mm_info`]).
326pub fn compute_block_bytes_with_mm(
327    tokens: &[Token],
328    block_offset: usize,
329    mm_runs: &[TokenBlockMmInfo],
330) -> Vec<u8> {
331    // Defense-in-depth: routing of each slot to the placeholder vs real-token branch
332    // depends on `mm_runs` being sorted by offset and non-overlapping. The public
333    // entry points (`Request::new`, `TokenBlockSequence::new_with_mm`,
334    // `split_tokens_with_mm`) enforce this via `validate_and_sort_mm_info`, but this
335    // function is also pub so we re-check in debug to catch direct misuse.
336    debug_assert!(
337        mm_runs.windows(2).all(|w| w[0].end() <= w[1].offset),
338        "compute_block_bytes_with_mm: mm_runs must be sorted by offset and non-overlapping (use validate_and_sort_mm_info)",
339    );
340    debug_assert!(
341        mm_runs.iter().all(|r| r.length > 0),
342        "compute_block_bytes_with_mm: mm_runs must have non-zero length",
343    );
344
345    if !block_has_mm(block_offset, tokens.len(), mm_runs) {
346        // Zero-MM-affecting-this-block: use the legacy encoding so the resulting block_hash
347        // matches what TokenBlockSequence::new would produce.
348        return cast_slice::<Token, u8>(tokens).to_vec();
349    }
350
351    const FRAME: usize = 13;
352    let mut out: Vec<u8> = Vec::with_capacity(tokens.len() * FRAME);
353    let mut run_idx = 0usize;
354    // Skip runs that ended at or before this block starts.
355    while run_idx < mm_runs.len() && mm_runs[run_idx].end() <= block_offset {
356        run_idx += 1;
357    }
358    for (s, &tok) in tokens.iter().enumerate() {
359        let g = block_offset + s;
360        // Advance past runs that end at or before g (validated non-overlapping => monotonic).
361        while run_idx < mm_runs.len() && mm_runs[run_idx].end() <= g {
362            run_idx += 1;
363        }
364        if run_idx < mm_runs.len() && mm_runs[run_idx].covers(g) {
365            let run = &mm_runs[run_idx];
366            let run_offset = (g - run.offset) as u32;
367            out.push(MM_SLOT_TAG_PLACEHOLDER);
368            out.extend_from_slice(&run_offset.to_le_bytes());
369            out.extend_from_slice(&run.mm_hash.to_le_bytes());
370        } else {
371            out.push(MM_SLOT_TAG_TOKEN);
372            out.extend_from_slice(&tok.to_le_bytes());
373            out.extend_from_slice(&0u64.to_le_bytes());
374        }
375    }
376    out
377}
378
379/// A 128-bit positional sequence hash combining traditional sequence hash with positional information.
380///
381/// Layout:
382/// - Lower 64 bits: Traditional SequenceHash
383/// - Upper 64 bits: 2-bit mode + position + LocalBlockHash (BlockHash)
384///
385/// Modes (automatically selected based on position):
386/// - Mode 00: 8-bit position (max 255) + 54-bit LBH
387/// - Mode 01: 16-bit position (max 65,535) + 46-bit LBH
388/// - Mode 10: 24-bit position (max 16,777,215) + 38-bit LBH
389/// - Mode 11: 31-bit position (max 2,147,483,647) + 31-bit LBH
390#[derive(Clone, Copy, PartialEq, Eq, Hash, Default, serde::Serialize, serde::Deserialize)]
391#[serde(transparent)]
392pub struct PositionalSequenceHash(#[serde(with = "serde_bytes_u128")] u128);
393
394impl PositionalSequenceHash {
395    /// Creates a new PositionalSequenceHash from components.
396    ///
397    /// The mode is automatically selected based on the position value to use the minimal
398    /// representation that can fit the position.
399    pub fn new(sequence_hash: SequenceHash, position: u64, local_block_hash: BlockHash) -> Self {
400        let mode = Self::select_mode(position);
401        let upper = Self::encode_upper(mode, position, local_block_hash);
402        let value = ((upper as u128) << 64) | (sequence_hash as u128);
403        PositionalSequenceHash(value)
404    }
405
406    /// Returns the sequence hash component (lower 64 bits).
407    pub fn sequence_hash(&self) -> SequenceHash {
408        (self.0 & 0xFFFF_FFFF_FFFF_FFFF) as u64
409    }
410
411    /// Returns the block position.
412    pub fn position(&self) -> u64 {
413        let (_, position, _) = self.decode_upper();
414        position
415    }
416
417    /// Returns the local block hash (BlockHash) component.
418    pub fn local_block_hash(&self) -> BlockHash {
419        let (_, _, lbh) = self.decode_upper();
420        lbh
421    }
422
423    /// Returns the mode used for encoding (0, 1, 2, or 3).
424    pub fn mode(&self) -> u8 {
425        let (mode, _, _) = self.decode_upper();
426        mode
427    }
428
429    /// Returns the inner 128-bit value.
430    #[inline(always)]
431    pub fn as_u128(&self) -> u128 {
432        self.0
433    }
434
435    /// Selects the minimal mode that can represent the given position.
436    fn select_mode(position: u64) -> u8 {
437        if position < (1u64 << 8) {
438            0 // Mode 00: 8-bit position
439        } else if position < (1u64 << 16) {
440            1 // Mode 01: 16-bit position
441        } else if position < (1u64 << 24) {
442            2 // Mode 10: 24-bit position
443        } else if position < (1u64 << 31) {
444            3 // Mode 11: 31-bit position
445        } else {
446            panic!(
447                "Position {} exceeds maximum supported value (2^31 - 1)",
448                position
449            );
450        }
451    }
452
453    /// Encodes the upper 64 bits from mode, position, and local block hash.
454    fn encode_upper(mode: u8, position: u64, local_block_hash: u64) -> u64 {
455        let (position_bits, lbh_bits) = match mode {
456            0 => (8, 54),  // 2 + 8 + 54 = 64
457            1 => (16, 46), // 2 + 16 + 46 = 64
458            2 => (24, 38), // 2 + 24 + 38 = 64
459            3 => (31, 31), // 2 + 31 + 31 = 64
460            _ => unreachable!(
461                "Invalid mode {} when encoding PositionalSequenceHash; mode must be 0, 1, 2, or 3",
462                mode
463            ),
464        };
465
466        // Create masks for extracting the relevant bits
467        let position_mask = (1u64 << position_bits) - 1;
468        let lbh_mask = (1u64 << lbh_bits) - 1;
469
470        // Extract and position components
471        let position_part = position & position_mask;
472        let lbh_part = local_block_hash & lbh_mask;
473
474        // Combine: [mode (2 bits)][position (X bits)][lbh (R bits)]
475        ((mode as u64) << 62) | (position_part << lbh_bits) | lbh_part
476    }
477
478    /// Decodes the upper 64 bits into (mode, position, local_block_hash).
479    fn decode_upper(&self) -> (u8, u64, u64) {
480        let upper = (self.0 >> 64) as u64;
481
482        // Extract mode from top 2 bits
483        let mode = (upper >> 62) as u8;
484
485        let (position_bits, lbh_bits) = match mode {
486            0 => (8, 54),
487            1 => (16, 46),
488            2 => (24, 38),
489            3 => (31, 31),
490            _ => unreachable!(
491                "Invalid mode {} in PositionalSequenceHash - value may be corrupted",
492                mode
493            ),
494        };
495
496        // Create masks
497        let lbh_mask = (1u64 << lbh_bits) - 1;
498        let position_mask = (1u64 << position_bits) - 1;
499
500        // Extract components
501        let lbh = upper & lbh_mask;
502        let position = (upper >> lbh_bits) & position_mask;
503
504        (mode, position, lbh)
505    }
506}
507
508impl std::fmt::Debug for PositionalSequenceHash {
509    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
510        f.debug_struct("PositionalSequenceHash")
511            .field("sequence_hash", &self.sequence_hash())
512            .field("local_block_hash", &self.local_block_hash())
513            .field("position", &self.position())
514            .finish()
515    }
516}
517
518/// A 128-bit positional lineage hash encoding parental lineage for tree traversal.
519///
520/// Layout (using full 128 bits):
521/// - Mode (2 bits): Determines position field size
522/// - Position (8/16/24 bits): Block position in sequence
523/// - Current Sequence Hash (64 bits): Full u64 sequence hash for this block
524/// - Parent Fragment (variable bits): Truncated lower bits of the parent's sequence hash
525///
526/// Modes (automatically selected based on position):
527/// - Mode 00: 8-bit position (max 255) + 64-bit current + 54-bit parent fragment
528/// - Mode 01: 16-bit position (max 65,535) + 64-bit current + 46-bit parent fragment
529/// - Mode 10: 24-bit position (max 16,777,215) + 64-bit current + 38-bit parent fragment
530///
531/// Carrying the full 64-bit current sequence hash inline makes PLH self-contained for
532/// chain extension: a child PLH can be derived from a parent PLH plus the child's
533/// `BlockHash` alone, with no out-of-band state. The parent fragment shrinks as
534/// position grows, but radix backward-traversal always matches on
535/// `(position, parent_fragment)`, and the probability of a shared parent at high
536/// positions decreases faster than the fragment-collision bound rises.
537#[derive(Clone, Copy, PartialEq, Eq, Hash, Default, serde::Serialize, serde::Deserialize)]
538#[serde(transparent)]
539pub struct PositionalLineageHash(#[serde(with = "serde_bytes_u128")] u128);
540
541impl PositionalLineageHash {
542    /// Creates a new PositionalLineageHash from components.
543    ///
544    /// The mode is automatically selected based on the position value to use the minimal
545    /// representation that can fit the position.
546    ///
547    /// # Arguments
548    ///
549    /// * `current_seq_hash` - The full u64 sequence hash of the current block
550    /// * `parent_seq_hash` - The full u64 sequence hash of the parent block (None for root).
551    ///   Will be truncated to this position's parent-fragment width.
552    /// * `position` - The block position in the sequence
553    ///
554    /// # Panics
555    ///
556    /// Panics if position >= 2^24 (16,777,216).
557    pub fn new(
558        current_seq_hash: SequenceHash,
559        parent_seq_hash: Option<SequenceHash>,
560        position: u64,
561    ) -> Self {
562        if position >= (1u64 << 24) {
563            panic!(
564                "Position {} exceeds maximum supported value (2^24 - 1 = 16,777,215)",
565                position
566            );
567        }
568
569        let mode = Self::select_mode(position);
570        let (position_bits, parent_bits) = Self::bit_layout(mode);
571
572        let position_mask = (1u128 << position_bits) - 1;
573        let parent_mask = (1u128 << parent_bits) - 1;
574
575        let position_part = (position as u128) & position_mask;
576        let current_part = current_seq_hash as u128;
577        let parent_part = (parent_seq_hash.unwrap_or(0) as u128) & parent_mask;
578
579        // Pack: [mode (2)][position (P)][current_u64 (64)][parent_fragment (R)]
580        let value = ((mode as u128) << 126)
581            | (position_part << (64 + parent_bits))
582            | (current_part << parent_bits)
583            | parent_part;
584
585        PositionalLineageHash(value)
586    }
587
588    /// Creates a root [`PositionalLineageHash`] (position 0).
589    ///
590    /// At the root, the sequence hash equals the block hash and there is no parent.
591    pub fn root(block_hash: BlockHash) -> Self {
592        Self::new(block_hash, None, 0)
593    }
594
595    /// Extends this lineage by one block, producing the child PLH.
596    ///
597    /// The chain recurrence is [`compute_next_sequence_hash`]. Salt does not seed the
598    /// per-step xxh3: `BlockHash` is already `xxh3(tokens, salt_hash)`, so salt is mixed
599    /// into `seq_hash[0]` and propagates through every parent. Re-feeding it at each step
600    /// would be redundant.
601    ///
602    /// # Panics
603    ///
604    /// Panics if `self.position() + 1 >= 2^24`.
605    pub fn extend(&self, child_block_hash: BlockHash) -> Self {
606        let parent_seq = self.current_sequence_hash();
607        let child_seq = compute_next_sequence_hash(parent_seq, child_block_hash);
608        Self::new(child_seq, Some(parent_seq), self.position() + 1)
609    }
610
611    /// Returns the block position.
612    pub fn position(&self) -> u64 {
613        let mode = self.mode();
614        let (position_bits, parent_bits) = Self::bit_layout(mode);
615        let position_mask = (1u128 << position_bits) - 1;
616        ((self.0 >> (64 + parent_bits)) & position_mask) as u64
617    }
618
619    /// Returns the full 64-bit sequence hash of the current block.
620    ///
621    /// Unlike the legacy `current_hash_fragment` (now removed), this is the complete
622    /// `SequenceHash` and is what [`extend`](Self::extend) feeds into the chain.
623    pub fn current_sequence_hash(&self) -> SequenceHash {
624        let mode = self.mode();
625        let (_, parent_bits) = Self::bit_layout(mode);
626        ((self.0 >> parent_bits) & 0xFFFF_FFFF_FFFF_FFFFu128) as u64
627    }
628
629    /// Returns the parent sequence hash fragment as stored in this PLH.
630    ///
631    /// Width depends on this PLH's mode (54/46/38 bits). Backward radix lookup must
632    /// always combine `(position, parent_fragment)` — the fragment alone is not unique.
633    pub fn parent_hash_fragment(&self) -> u64 {
634        let mode = self.mode();
635        let (_, parent_bits) = Self::bit_layout(mode);
636        let parent_mask = (1u128 << parent_bits) - 1;
637        (self.0 & parent_mask) as u64
638    }
639
640    /// Truncates this PLH's `current_sequence_hash` to the parent-fragment width that
641    /// a child at `child_position` would store.
642    ///
643    /// Useful when an external builder wants to construct a child PLH or verify a
644    /// parent/child edge: `child.parent_hash_fragment() ==
645    /// parent.parent_fragment_for_child_position(child.position())`.
646    pub fn parent_fragment_for_child_position(&self, child_position: u64) -> u64 {
647        let child_mode = Self::select_mode(child_position);
648        let (_, child_parent_bits) = Self::bit_layout(child_mode);
649        let mask = (1u64 << child_parent_bits).wrapping_sub(1);
650        self.current_sequence_hash() & mask
651    }
652
653    /// Returns the mode used for encoding (0, 1, or 2).
654    pub fn mode(&self) -> u8 {
655        (self.0 >> 126) as u8
656    }
657
658    /// Returns the inner 128-bit value.
659    #[inline(always)]
660    pub fn as_u128(&self) -> u128 {
661        self.0
662    }
663
664    /// Selects the minimal mode that can represent the given position.
665    fn select_mode(position: u64) -> u8 {
666        if position < (1u64 << 8) {
667            0 // Mode 00: 8-bit position
668        } else if position < (1u64 << 16) {
669            1 // Mode 01: 16-bit position
670        } else {
671            2 // Mode 10: 24-bit position
672        }
673    }
674
675    /// Returns the bit layout for a given mode: (position_bits, parent_fragment_bits).
676    /// Current is always 64 bits.
677    fn bit_layout(mode: u8) -> (u32, u32) {
678        match mode {
679            0 => (8, 54),  // 2 + 8 + 64 + 54 = 128
680            1 => (16, 46), // 2 + 16 + 64 + 46 = 128
681            2 => (24, 38), // 2 + 24 + 64 + 38 = 128
682            _ => unreachable!(
683                "Invalid mode {} in PositionalLineageHash; mode must be 0, 1, or 2",
684                mode
685            ),
686        }
687    }
688}
689
690impl PositionalLineageHash {
691    fn format_impl(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
692        let position = self.position();
693        let current_hash = self.current_sequence_hash();
694        let current_hash_b58 = bs58::encode(current_hash.to_be_bytes()).into_string();
695
696        if position == 0 {
697            write!(f, "{}:{}", position, current_hash_b58)
698        } else {
699            let parent_hash = self.parent_hash_fragment();
700            let parent_hash_b58 = bs58::encode(parent_hash.to_be_bytes()).into_string();
701            write!(f, "{}:{}:{}", position, current_hash_b58, parent_hash_b58)
702        }
703    }
704}
705
706impl std::fmt::Debug for PositionalLineageHash {
707    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
708        self.format_impl(f)
709    }
710}
711
712impl std::fmt::Display for PositionalLineageHash {
713    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
714        self.format_impl(f)
715    }
716}
717
718impl std::cmp::PartialOrd for PositionalLineageHash {
719    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
720        Some(self.cmp(other))
721    }
722}
723
724impl std::cmp::Ord for PositionalLineageHash {
725    /// Lexicographic order: [`Self::position`], then [`Self::current_sequence_hash`],
726    /// then the full packed [`Self::as_u128`] so the order is total and consistent with [`Eq`].
727    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
728        self.position()
729            .cmp(&other.position())
730            .then_with(|| {
731                self.current_sequence_hash()
732                    .cmp(&other.current_sequence_hash())
733            })
734            .then_with(|| self.0.cmp(&other.0))
735    }
736}
737
738/// A collection of tokens, represented as a `Vec<Token>`.
739///
740/// Provides convenience methods for conversion and manipulation.
741#[derive(Debug, Clone, Dissolve, Default, Eq)]
742pub struct Tokens(Vec<Token>);
743
744impl AsRef<[Token]> for Tokens {
745    fn as_ref(&self) -> &[Token] {
746        &self.0
747    }
748}
749
750impl std::ops::Deref for Tokens {
751    type Target = [Token];
752
753    fn deref(&self) -> &Self::Target {
754        &self.0
755    }
756}
757
758impl std::borrow::Borrow<[Token]> for Tokens {
759    fn borrow(&self) -> &[Token] {
760        &self.0
761    }
762}
763
764impl From<Vec<Token>> for Tokens {
765    fn from(tokens: Vec<Token>) -> Self {
766        Tokens(tokens)
767    }
768}
769
770impl From<&[Token]> for Tokens {
771    fn from(tokens: &[Token]) -> Self {
772        Tokens(tokens.to_vec())
773    }
774}
775
776impl From<Vec<usize>> for Tokens {
777    fn from(tokens: Vec<usize>) -> Self {
778        Tokens(
779            tokens
780                .into_iter()
781                .map(|t| t.try_into().expect("Token ID exceeds u32::MAX"))
782                .collect(),
783        )
784    }
785}
786
787impl From<Vec<i32>> for Tokens {
788    /// Converts `Vec<i32>` to `Tokens`, casting each `i32` to `u32`.
789    fn from(tokens: Vec<i32>) -> Self {
790        Tokens(tokens.into_iter().map(|t| t as u32).collect())
791    }
792}
793
794impl From<&[i32]> for Tokens {
795    /// Converts `&[i32]` to `Tokens`, casting each `i32` to `u32`.
796    fn from(tokens: &[i32]) -> Self {
797        Tokens(tokens.iter().map(|&t| t as u32).collect())
798    }
799}
800
801impl From<Tokens> for Vec<Token> {
802    fn from(tokens: Tokens) -> Self {
803        tokens.0
804    }
805}
806
807// PartialEq implementations for comparing Tokens with Vec<Token> and &[Token]
808// (Generated implementations are usually sufficient, but explicit ones can be clearer)
809impl PartialEq<Vec<Token>> for Tokens {
810    fn eq(&self, other: &Vec<Token>) -> bool {
811        self.0 == *other
812    }
813}
814
815impl PartialEq<Tokens> for Vec<Token> {
816    fn eq(&self, other: &Tokens) -> bool {
817        *self == other.0
818    }
819}
820
821impl PartialEq<[Token]> for Tokens {
822    fn eq(&self, other: &[Token]) -> bool {
823        self.0.as_slice() == other
824    }
825}
826
827impl PartialEq<Tokens> for &[Token] {
828    fn eq(&self, other: &Tokens) -> bool {
829        *self == other.0.as_slice()
830    }
831}
832
833impl PartialEq for Tokens {
834    fn eq(&self, other: &Self) -> bool {
835        self.0 == other.0
836    }
837}
838
839// Add PartialEq<&[T]> where T: Into<Token> + Copy could be more general,
840// but specifically implementing for &[Token] is sufficient for the tests.
841impl PartialEq<&[Token]> for Tokens {
842    fn eq(&self, other: &&[Token]) -> bool {
843        self.0.as_slice() == *other
844    }
845}
846
847impl Tokens {
848    fn with_capacity(capacity: usize) -> Self {
849        Tokens(Vec::with_capacity(capacity))
850    }
851
852    /// Consumes the [`Tokens`] object and creates a [`TokenBlockSequence`].
853    ///
854    /// The sequence is initialized with the provided tokens, splitting them into blocks
855    /// of the specified `block_size` using the given `salt_hash` (or 0 if `None`).
856    ///
857    /// # Arguments
858    ///
859    /// * `block_size` - The fixed size for each [`TokenBlock`].
860    /// * `salt_hash` - An optional [`SaltHash`] used as the base seed for hashing. Defaults to 0.
861    pub fn into_sequence(self, block_size: u32, salt_hash: Option<SaltHash>) -> TokenBlockSequence {
862        TokenBlockSequence::new(self, block_size, salt_hash)
863    }
864}
865
866/// Errors that can occur during [`PartialTokenBlock`] operations.
867#[derive(Debug, Clone, Copy, PartialEq, Eq, thiserror::Error)]
868pub enum TokenBlockError {
869    /// The operation could not be completed because the block is full.
870    #[error("TokenBlock is full")]
871    Full,
872
873    /// The operation requires a full block, but the block is incomplete.
874    #[error("TokenBlock is incomplete")]
875    Incomplete,
876
877    /// The operation could not be completed because the block is empty.
878    #[error("TokenBlock is empty")]
879    Empty,
880
881    /// The operation requires more tokens than are currently in the block.
882    #[error("TokenBlock has insufficient tokens")]
883    InsufficientTokens,
884
885    /// Multimodal info validation failed.
886    #[error(transparent)]
887    MmInfo(#[from] MmInfoError),
888
889    /// A mutating operation is not supported on a sequence with multimodal runs.
890    #[error("operation is not supported on a TokenBlockSequence with multimodal runs")]
891    MmRunsPresent,
892}
893
894/// Represents a partially filled block of tokens within a sequence.
895///
896/// This structure accumulates tokens until it reaches the specified `block_size`,
897/// at which point it can be [`commit`](PartialTokenBlock::commit)ted into a full [`TokenBlock`].
898#[derive(Debug, PartialEq)] // No Clone: intended to be unique within a sequence
899pub struct PartialTokenBlock {
900    tokens: Tokens,
901    block_size: u32,
902    salt_hash: SaltHash,
903    parent_sequence_hash: Option<SequenceHash>,
904    position: usize, // The position this block will have when committed
905}
906
907impl PartialTokenBlock {
908    /// Creates the first partial block (root) for a new sequence.
909    ///
910    /// # Arguments
911    ///
912    /// * `block_size` - The fixed size for blocks in this sequence.
913    /// * `salt_hash` - The [`SaltHash`] for the sequence.
914    pub(crate) fn create_sequence_root(block_size: u32, salt_hash: SaltHash) -> Self {
915        Self {
916            tokens: Tokens::with_capacity(block_size as usize),
917            block_size,
918            salt_hash,
919            parent_sequence_hash: None, // Root has no parent
920            position: 0,                // First block is at position 0
921        }
922    }
923
924    /// Attempts to push multiple tokens onto the block from a [`Tokens`] object.
925    ///
926    /// Tokens are added until the block is full or all input tokens are consumed.
927    ///
928    /// # Arguments
929    ///
930    /// * `tokens` - The [`Tokens`] to push.
931    ///
932    /// # Returns
933    ///
934    /// A new [`Tokens`] object containing any tokens that did not fit,
935    /// if all tokens were added, the returned object will be empty.
936    pub(crate) fn push_tokens(&mut self, tokens: Tokens) -> Tokens {
937        let remaining_space = self.remaining();
938
939        if remaining_space == 0 {
940            return tokens; // Block is already full
941        }
942
943        if tokens.0.len() <= remaining_space {
944            // All tokens fit
945            self.tokens.0.extend(tokens.0);
946            Tokens::default() // No remaining tokens
947        } else {
948            // Only some tokens fit
949            let (to_add, remaining) = tokens.0.split_at(remaining_space);
950            self.tokens.0.extend_from_slice(to_add);
951            Tokens(remaining.to_vec()) // Return the leftover tokens
952        }
953    }
954
955    /// Attempts to push a single token onto the block.
956    ///
957    /// # Returns
958    ///
959    /// * `Ok(())` - If the token was successfully added.
960    /// * `Err(TokenBlockError::Full)` - If the block already contains `block_size` tokens.
961    #[cfg(test)]
962    pub(crate) fn push_token(&mut self, token: Token) -> Result<(), TokenBlockError> {
963        if self.tokens.0.len() >= self.block_size as usize {
964            return Err(TokenBlockError::Full);
965        }
966        self.tokens.0.push(token);
967        Ok(())
968    }
969
970    /// Attempts to remove the last `count` tokens from the block.
971    ///
972    /// # Arguments
973    ///
974    /// * `count` - The number of tokens to remove.
975    ///
976    /// # Returns
977    ///
978    /// * `Ok(())` - If the specified number of tokens were successfully removed.
979    /// * `Err(TokenBlockError::InsufficientTokens)` - If `count` is greater than the number of tokens in the block.
980    pub(crate) fn pop_tokens(&mut self, count: usize) -> Result<(), TokenBlockError> {
981        if self.tokens.0.len() < count {
982            return Err(TokenBlockError::InsufficientTokens);
983        }
984        self.tokens.0.truncate(self.tokens.0.len() - count);
985        Ok(())
986    }
987
988    /// Attempts to commit the current partial block into a full [`TokenBlock`].
989    ///
990    /// This operation consumes the tokens within the partial block.
991    /// After a successful commit, this `PartialTokenBlock` instance is reset
992    /// to represent the *next* partial block in the sequence, inheriting the
993    /// sequence hash from the block just committed.
994    ///
995    /// # Returns
996    ///
997    /// * `Ok(TokenBlock)` - The newly created full [`TokenBlock`].
998    /// * `Err(TokenBlockError::Incomplete)` - If the block does not contain exactly `block_size` tokens.
999    pub fn commit(&mut self) -> Result<TokenBlock, TokenBlockError> {
1000        if self.tokens.0.len() != self.block_size as usize {
1001            // Check for exact size match for committing
1002            return Err(TokenBlockError::Incomplete);
1003        }
1004
1005        // Take ownership of the tokens, leaving the internal tokens empty
1006        let tokens = std::mem::replace(
1007            &mut self.tokens,
1008            Tokens::with_capacity(self.block_size as usize),
1009        );
1010
1011        let chunk = TokenBlockChunk::new(tokens, self.salt_hash);
1012        let block = TokenBlock::from_chunk(chunk, self.parent_sequence_hash, self.position);
1013
1014        // Reset self to be the next block in the sequence
1015        self.parent_sequence_hash = Some(block.sequence_hash());
1016        self.position += 1; // Increment position for the next block
1017        // self.block_size and self.salt_hash remain the same
1018
1019        Ok(block)
1020    }
1021
1022    /// Returns the number of additional tokens required to fill the block.
1023    pub fn remaining(&self) -> usize {
1024        // Use saturating_sub to prevent underflow if len somehow exceeds block_size
1025        (self.block_size as usize).saturating_sub(self.tokens.0.len())
1026    }
1027
1028    /// Returns the number of tokens currently in the block.
1029    pub fn len(&self) -> usize {
1030        self.tokens.0.len()
1031    }
1032
1033    /// Returns `true` if the block contains no tokens.
1034    pub fn is_empty(&self) -> bool {
1035        self.tokens.0.is_empty()
1036    }
1037
1038    /// Returns a reference to the tokens currently in the block.
1039    pub fn tokens(&self) -> &Tokens {
1040        &self.tokens
1041    }
1042}
1043
1044// Deref allows treating &PartialTokenBlock like &Tokens for read-only access.
1045impl std::ops::Deref for PartialTokenBlock {
1046    type Target = Tokens;
1047
1048    fn deref(&self) -> &Self::Target {
1049        &self.tokens
1050    }
1051}
1052
1053/// An intermediate structure holding a chunk of tokens destined to become a [`TokenBlock`].
1054///
1055/// This calculates the [`BlockHash`] but does not compute the final [`SequenceHash`],
1056/// allowing chunks to be processed independently (e.g., in parallel).
1057#[derive(Debug)] // No Clone: temporary intermediate value
1058struct TokenBlockChunk {
1059    tokens: Tokens,
1060    salt_hash: SaltHash,
1061    block_hash: BlockHash,
1062}
1063
1064impl TokenBlockChunk {
1065    /// Creates a new chunk from [`Tokens`], calculating the [`BlockHash`].
1066    fn new(tokens: Tokens, salt_hash: SaltHash) -> Self {
1067        let block_hash = compute_block_hash(cast_slice(&tokens), salt_hash);
1068        Self {
1069            tokens,
1070            salt_hash,
1071            block_hash,
1072        }
1073    }
1074
1075    /// Creates a new chunk from a slice of `&[Token]`, calculating the [`BlockHash`].
1076    fn from_tokens(tokens: &[Token], salt_hash: SaltHash) -> Self {
1077        let block_hash = compute_block_hash(cast_slice(tokens), salt_hash);
1078        Self {
1079            tokens: tokens.into(), // Converts slice to owned Tokens
1080            salt_hash,
1081            block_hash,
1082        }
1083    }
1084}
1085
1086/// Represents a completed, immutable block of tokens with associated hashes.
1087///
1088/// Contains exactly `block_size` tokens and includes the [`SaltHash`], [`BlockHash`],
1089/// [`SequenceHash`], [`PositionalSequenceHash`], [`PositionalLineageHash`], and optionally the parent's [`SequenceHash`].
1090#[derive(Debug, Clone, Default, PartialEq)] // Add PartialEq for tests
1091pub struct TokenBlock {
1092    tokens: Tokens,
1093    salt_hash: SaltHash,
1094    block_hash: BlockHash,
1095    sequence_hash: SequenceHash,
1096    parent_sequence_hash: Option<SequenceHash>,
1097    positional_sequence_hash: PositionalSequenceHash,
1098    positional_lineage_hash: PositionalLineageHash,
1099}
1100
1101impl TokenBlock {
1102    /// Creates a new [`PartialTokenBlock`] representing the block immediately following this one.
1103    ///
1104    /// The new partial block will have the correct `parent_sequence_hash` and `position` set.
1105    pub fn next_block(&self) -> PartialTokenBlock {
1106        PartialTokenBlock {
1107            tokens: Tokens::with_capacity(self.tokens.len()),
1108            block_size: self.tokens.len() as u32, // Should be == self.block_size
1109            salt_hash: self.salt_hash,
1110            parent_sequence_hash: Some(self.sequence_hash), // Link to this block
1111            position: self.position() as usize + 1,         // Next position
1112        }
1113    }
1114
1115    /// Finalizes a [`TokenBlock`] from a [`TokenBlockChunk`], parent's sequence hash, and position.
1116    ///
1117    /// This computes the final [`SequenceHash`], [`PositionalSequenceHash`], and [`PositionalLineageHash`] for the block.
1118    fn from_chunk(
1119        chunk: TokenBlockChunk,
1120        parent_sequence_hash: Option<SequenceHash>,
1121        position: usize,
1122    ) -> Self {
1123        let sequence_hash = match parent_sequence_hash {
1124            Some(parent) => compute_next_sequence_hash(parent, chunk.block_hash),
1125            None => {
1126                // First block: sequence hash is just the block hash
1127                chunk.block_hash
1128            }
1129        };
1130
1131        let positional_sequence_hash = PositionalSequenceHash::new(
1132            sequence_hash,
1133            position as u64,
1134            chunk.block_hash, // LocalBlockHash is the same as BlockHash
1135        );
1136
1137        let positional_lineage_hash =
1138            PositionalLineageHash::new(sequence_hash, parent_sequence_hash, position as u64);
1139
1140        Self {
1141            tokens: chunk.tokens,
1142            salt_hash: chunk.salt_hash,
1143            block_hash: chunk.block_hash,
1144            sequence_hash,
1145            parent_sequence_hash,
1146            positional_sequence_hash,
1147            positional_lineage_hash,
1148        }
1149    }
1150
1151    /// Returns a reference to the tokens in this block.
1152    pub fn tokens(&self) -> &Tokens {
1153        &self.tokens
1154    }
1155
1156    /// Returns the salt hash used for this block's hashing.
1157    pub fn salt_hash(&self) -> SaltHash {
1158        self.salt_hash
1159    }
1160
1161    /// Returns the hash of only the tokens within this block.
1162    pub fn block_hash(&self) -> BlockHash {
1163        self.block_hash
1164    }
1165
1166    /// Returns the sequence-aware hash for this block.
1167    pub fn sequence_hash(&self) -> SequenceHash {
1168        self.sequence_hash
1169    }
1170
1171    /// Returns the sequence hash of the preceding block, if any.
1172    pub fn parent_sequence_hash(&self) -> Option<SequenceHash> {
1173        self.parent_sequence_hash
1174    }
1175
1176    /// Returns the number of tokens in the block.
1177    pub fn block_size(&self) -> usize {
1178        self.tokens.0.len()
1179    }
1180
1181    /// Returns the positional sequence hash for this block.
1182    pub fn positional_sequence_hash(&self) -> PositionalSequenceHash {
1183        self.positional_sequence_hash
1184    }
1185
1186    /// Returns the positional lineage hash for this block.
1187    pub fn positional_lineage_hash(&self) -> PositionalLineageHash {
1188        self.positional_lineage_hash
1189    }
1190
1191    /// Returns the position of this block in the sequence.
1192    pub fn position(&self) -> u64 {
1193        self.positional_sequence_hash.position()
1194    }
1195}
1196
1197impl PositionalHash for PositionalSequenceHash {
1198    fn position(&self) -> u64 {
1199        self.position()
1200    }
1201}
1202
1203impl PositionalHash for PositionalLineageHash {
1204    fn position(&self) -> u64 {
1205        self.position()
1206    }
1207}
1208
1209/// Represents a sequence of tokens, segmented into fixed-size, hashed blocks.
1210///
1211/// This structure manages a series of completed [`TokenBlock`]s and one
1212/// [`PartialTokenBlock`] for accumulating incoming tokens.
1213/// It provides methods for appending tokens (`append`, `extend`), removing tokens
1214/// (`pop`, `truncate`, `unwind`), and accessing sequence information.
1215///
1216/// Hashing incorporates an initial [`SaltHash`] to ensure uniqueness across different
1217/// contexts (e.g., different models, PEFTs).
1218///
1219/// Key Hashes:
1220/// - [`BlockHash`]: Hash of tokens within a single block (seeded by [`SaltHash`]).
1221/// - [`SequenceHash`]: Hash combining the previous block's [`SequenceHash`] and the current
1222///   block's [`BlockHash`] (also seeded by [`SaltHash`]).
1223#[derive(Debug, PartialEq)]
1224pub struct TokenBlockSequence {
1225    blocks: Vec<TokenBlock>,
1226    current_block: PartialTokenBlock,
1227    salt_hash: SaltHash,
1228    block_size: usize,
1229    /// Validated, sorted multimodal runs covering committed and partial slots.
1230    /// Empty for sequences built via the zero-MM constructors; populated by
1231    /// [`TokenBlockSequence::new_with_mm`] and the streaming MM helpers.
1232    mm_runs: Vec<TokenBlockMmInfo>,
1233}
1234
1235impl TokenBlockSequence {
1236    /// Creates a new [`TokenBlockSequence`] from an initial set of tokens.
1237    ///
1238    /// The tokens are split into blocks of `block_size`. Any remaining tokens
1239    /// form the initial `current_block`.
1240    ///
1241    /// # Arguments
1242    ///
1243    /// * `tokens` - The initial [`Tokens`] for the sequence.
1244    /// * `block_size` - The fixed size for each [`TokenBlock`]. Must be greater than 0.
1245    /// * `salt_hash` - An optional [`SaltHash`]. Defaults to 0 if `None`.
1246    ///
1247    /// # Panics
1248    ///
1249    /// Panics if `block_size` is 0.
1250    pub fn new(tokens: Tokens, block_size: u32, salt_hash: Option<SaltHash>) -> Self {
1251        assert!(block_size > 0, "block_size must be greater than 0");
1252        let salt_hash = salt_hash.unwrap_or_default();
1253        let (blocks, current_block) = Self::split_tokens(&tokens, block_size, salt_hash);
1254
1255        Self {
1256            blocks,
1257            current_block,
1258            salt_hash,
1259            block_size: block_size as usize,
1260            mm_runs: Vec::new(),
1261        }
1262    }
1263
1264    /// Extends the sequence with the given tokens, potentially completing multiple blocks.
1265    ///
1266    /// This method processes all tokens from the input [`Tokens`] object.
1267    /// If adding tokens causes one or more blocks to become full, they are committed
1268    /// and added to the internal list of completed blocks.
1269    ///
1270    /// # Arguments
1271    ///
1272    /// * `tokens` - The [`Tokens`] object containing the tokens to extend the sequence with.
1273    ///
1274    /// # Returns
1275    ///
1276    /// * `Ok(Some(Range<usize>))` - The range of indices in the `blocks` vector corresponding
1277    ///   to the blocks completed during this `extend` operation.
1278    /// * `Ok(None)` - If no blocks were completed.
1279    /// * `Err(TokenBlockError)` - If an internal error occurs during commit.
1280    pub fn extend(&mut self, tokens: Tokens) -> Result<Option<Range<usize>>, TokenBlockError> {
1281        let start_block_index = self.blocks.len();
1282        let mut tokens_to_append = tokens;
1283
1284        while !tokens_to_append.is_empty() {
1285            let remaining_in_current = self.current_block.remaining();
1286
1287            if remaining_in_current == 0 {
1288                // Current block is full, commit it first.
1289                let new_block = self.commit_current()?;
1290                self.blocks.push(new_block);
1291                // Continue loop to add tokens to the *new* current_block.
1292            }
1293
1294            // Push as many tokens as possible into the current (potentially new) block.
1295            let available_tokens = tokens_to_append;
1296            tokens_to_append = self.current_block.push_tokens(available_tokens);
1297
1298            // Check if the current block *became* full after pushing tokens.
1299            if self.current_block.remaining() == 0 {
1300                // If it became full AND there are still more tokens to append,
1301                // commit it now so the next loop iteration starts with a fresh block.
1302                let new_block = self.commit_current()?;
1303                self.blocks.push(new_block);
1304            }
1305        }
1306
1307        let end_block_index = self.blocks.len();
1308        if start_block_index == end_block_index {
1309            Ok(None) // No blocks were completed.
1310        } else {
1311            Ok(Some(start_block_index..end_block_index))
1312        }
1313    }
1314
1315    /// Commits the current partial block.
1316    ///
1317    /// Routes through the MM-aware byte encoding when [`Self::mm_runs`] is non-empty;
1318    /// otherwise behaves identically to [`PartialTokenBlock::commit`].
1319    fn commit_current(&mut self) -> Result<TokenBlock, TokenBlockError> {
1320        if self.mm_runs.is_empty() {
1321            return self.current_block.commit();
1322        }
1323        // MM-aware path: compute block_hash from the substituted byte buffer.
1324        if self.current_block.tokens.0.len() != self.current_block.block_size as usize {
1325            return Err(TokenBlockError::Incomplete);
1326        }
1327        let block_offset = self.blocks.len() * (self.current_block.block_size as usize);
1328        let tokens = std::mem::take(&mut self.current_block.tokens);
1329        let block_bytes = compute_block_bytes_with_mm(&tokens, block_offset, &self.mm_runs);
1330        let block_hash = compute_block_hash(&block_bytes, self.current_block.salt_hash);
1331        let chunk = TokenBlockChunk {
1332            tokens,
1333            salt_hash: self.current_block.salt_hash,
1334            block_hash,
1335        };
1336        let block = TokenBlock::from_chunk(
1337            chunk,
1338            self.current_block.parent_sequence_hash,
1339            self.current_block.position,
1340        );
1341        self.current_block.parent_sequence_hash = Some(block.sequence_hash());
1342        self.current_block.position += 1;
1343        Ok(block)
1344    }
1345
1346    /// Appends a single token to the sequence.
1347    ///
1348    /// If adding this token completes the current partial block, the block is committed,
1349    /// and the index of the newly completed block is returned.
1350    ///
1351    /// This method is equivalent to calling [`extend`] with a single-token [`Tokens`] object.
1352    ///
1353    /// # Arguments
1354    ///
1355    /// * `token` - The [`Token`] to append.
1356    ///
1357    /// # Returns
1358    ///
1359    /// * `Ok(Some(usize))` - The index of the block that was just completed.
1360    /// * `Ok(None)` - No block was completed by adding this token.
1361    /// * `Err(TokenBlockError)` - If an internal error occurs during processing.
1362    pub fn append(&mut self, token: Token) -> Result<Option<usize>, TokenBlockError> {
1363        let before = self.blocks.len();
1364        self.extend(Tokens::from(vec![token]))?;
1365        Ok(if self.blocks.len() > before {
1366            Some(before)
1367        } else {
1368            None
1369        })
1370    }
1371
1372    /// Shortens the sequence, keeping the first `len` tokens and removing the rest.
1373    ///
1374    /// If `len` is greater than the sequence's current length, this has no effect.
1375    ///
1376    /// This operation is analogous to `Vec::truncate`.
1377    /// It may involve removing tokens from the current partial block, removing entire
1378    /// completed blocks, and adjusting the current partial block
1379    /// to reflect the new end of the sequence.
1380    ///
1381    /// # Arguments
1382    ///
1383    /// * `len` - The number of tokens to keep.
1384    ///
1385    /// # Returns
1386    ///
1387    /// * `Ok(())` - If the sequence was successfully truncated.
1388    /// * `Err(TokenBlockError::InsufficientTokens)` - This error should ideally not occur if `len`
1389    ///   is correctly checked against `total_tokens`, but the underlying `pop_tokens` might return it.
1390    pub fn truncate(&mut self, len: usize) -> Result<(), TokenBlockError> {
1391        if !self.mm_runs.is_empty() {
1392            return Err(TokenBlockError::MmRunsPresent);
1393        }
1394        let current_total_len = self.total_tokens();
1395        if len >= current_total_len {
1396            return Ok(()); // Nothing to truncate
1397        }
1398
1399        let n = current_total_len - len; // Number of tokens to remove
1400
1401        // This inner block handles the actual removal logic based on `n` tokens to remove.
1402        {
1403            let current_len = self.current_block.len();
1404            // Avoid division by zero if block_size is somehow 0 (though asserted in new)
1405            let block_size = self.current_block.block_size.max(1);
1406
1407            if n <= current_len {
1408                // Only need to pop from the current partial block
1409                self.current_block.pop_tokens(n)?;
1410            } else {
1411                // Need to pop from full blocks as well
1412                let tokens_to_pop_from_blocks = n - current_len;
1413
1414                // Calculate how many blocks are affected (including the one partially popped)
1415                let num_blocks_to_affect = tokens_to_pop_from_blocks.div_ceil(block_size as usize);
1416
1417                // Check if we need to pop more blocks than available (should be prevented by initial len check)
1418                if num_blocks_to_affect > self.blocks.len() {
1419                    // This indicates an inconsistency between total_tokens() and internal state.
1420                    debug_assert!(
1421                        false,
1422                        "Truncate calculation error: trying to pop too many blocks."
1423                    );
1424                    return Err(TokenBlockError::InsufficientTokens);
1425                }
1426
1427                // Determine the index of the block that will be the source for the new partial block
1428                let source_block_index = self.blocks.len() - num_blocks_to_affect;
1429
1430                // Calculate how many tokens to keep from that source block
1431                let num_full_blocks_completely_popped = num_blocks_to_affect - 1;
1432                let num_tokens_to_pop_from_source_block = tokens_to_pop_from_blocks
1433                    - num_full_blocks_completely_popped * block_size as usize;
1434                let num_tokens_to_keep_in_new_partial =
1435                    (block_size as usize).saturating_sub(num_tokens_to_pop_from_source_block);
1436
1437                // Get the tokens for the new partial block
1438                let new_partial_tokens = if num_tokens_to_keep_in_new_partial > 0 {
1439                    self.blocks[source_block_index].tokens().as_ref()
1440                        [..num_tokens_to_keep_in_new_partial]
1441                        .to_vec()
1442                } else {
1443                    Vec::new()
1444                };
1445
1446                // Truncate the blocks vector to remove popped blocks
1447                self.blocks.truncate(source_block_index);
1448
1449                // Update the current_block state
1450                self.current_block.tokens = Tokens(new_partial_tokens);
1451                // Correctly set the parent hash based on the *new* last block
1452                self.current_block.parent_sequence_hash =
1453                    self.blocks.last().map(|b| b.sequence_hash());
1454                // Update position to match the number of complete blocks
1455                self.current_block.position = self.blocks.len();
1456                // salt_hash and block_size remain the same for current_block
1457            }
1458        }
1459        Ok(())
1460    }
1461
1462    /// Removes the last `count` tokens from the sequence.
1463    ///
1464    /// This is a convenience method that calculates the required length and calls [`truncate`].
1465    ///
1466    /// # Arguments
1467    ///
1468    /// * `count` - The number of tokens to remove from the end.
1469    ///
1470    /// # Returns
1471    ///
1472    /// * `Ok(())` - If the tokens were successfully removed.
1473    /// * `Err(TokenBlockError::InsufficientTokens)` - If `count` is greater than or equal to
1474    ///   the total number of tokens in the sequence.
1475    pub fn unwind(&mut self, count: usize) -> Result<(), TokenBlockError> {
1476        let current_total_len = self.total_tokens();
1477        if count > current_total_len {
1478            // Allow count == current_total_len, which truncates to 0.
1479            return Err(TokenBlockError::InsufficientTokens);
1480        }
1481
1482        // number of tokens remaining in the sequence after undoing the given count
1483        let len = current_total_len - count;
1484        self.truncate(len)
1485    }
1486
1487    /// Resets the sequence to the initial state.
1488    ///
1489    /// Clears any accumulated multimodal runs; after `reset` the sequence behaves
1490    /// identically to a freshly-constructed zero-MM sequence with the same `salt_hash`
1491    /// and `block_size`.
1492    pub fn reset(&mut self) {
1493        self.blocks.clear();
1494        self.current_block =
1495            PartialTokenBlock::create_sequence_root(self.block_size as u32, self.salt_hash);
1496        self.mm_runs.clear();
1497    }
1498
1499    /// Removes the last token from the sequence and returns it, or [`None`] if it is empty.
1500    ///
1501    /// This operation is analogous to `Vec::pop`.
1502    ///
1503    /// # Returns
1504    ///
1505    /// * `Some(Token)` - The last token, if the sequence was not empty.
1506    /// * `None` - If the sequence was empty.
1507    ///
1508    /// # Panics
1509    ///
1510    /// Panics if the sequence has accumulated multimodal runs (see [`Self::try_pop`] for a
1511    /// non-panicking variant). `pop` returns `Option<Token>` and cannot signal
1512    /// "operation unsupported on MM sequence" through its return type without breaking the
1513    /// `Vec::pop` analogy and silently lying to callers.
1514    pub fn pop(&mut self) -> Option<Token> {
1515        if !self.mm_runs.is_empty() {
1516            panic!(
1517                "TokenBlockSequence::pop is not supported on a sequence with multimodal runs; \
1518                 use try_pop or reset before pop"
1519            );
1520        }
1521        let current_total_len = self.total_tokens();
1522        if current_total_len == 0 {
1523            return None;
1524        }
1525
1526        // Determine the last token. It must be in the current_block if current_block is not empty.
1527        // If current_block is empty, it must be the last token of the last full block.
1528        let last_token = if !self.current_block.tokens.is_empty() {
1529            // Last token is in the partial block
1530            *self
1531                .current_block
1532                .tokens
1533                .last()
1534                .expect("Current block checked for non-empty")
1535        } else {
1536            // Current block is empty, sequence is not. Must be in the last full block.
1537            let last_block = self
1538                .blocks
1539                .last()
1540                .expect("Sequence is not empty but has no blocks and empty current block?");
1541            *last_block
1542                .tokens()
1543                .last()
1544                .expect("Last block cannot be empty")
1545        };
1546
1547        // Truncate the sequence by one element.
1548        // We expect this to succeed since we know the length > 0.
1549        match self.truncate(current_total_len - 1) {
1550            Ok(_) => Some(last_token),
1551            Err(_) => {
1552                // This should be logically impossible if total_tokens() and truncate() are correct.
1553                // Panic in debug, return None in release as a fallback, though it indicates a bug.
1554                debug_assert!(
1555                    false,
1556                    "truncate failed unexpectedly after checking length in pop"
1557                );
1558                None
1559            }
1560        }
1561    }
1562
1563    /// Non-panicking variant of [`Self::pop`].
1564    ///
1565    /// Returns:
1566    /// - `Ok(Some(token))` when the sequence had at least one token and pop succeeded.
1567    /// - `Ok(None)` when the sequence was empty.
1568    /// - `Err(TokenBlockError::MmRunsPresent)` when the sequence has multimodal runs.
1569    pub fn try_pop(&mut self) -> Result<Option<Token>, TokenBlockError> {
1570        if !self.mm_runs.is_empty() {
1571            return Err(TokenBlockError::MmRunsPresent);
1572        }
1573        Ok(self.pop())
1574    }
1575
1576    /// Returns a slice containing all the completed [`TokenBlock`]s in the sequence.
1577    pub fn blocks(&self) -> &[TokenBlock] {
1578        &self.blocks
1579    }
1580
1581    /// Returns a reference to the last completed [`TokenBlock`] in the sequence, if any.
1582    pub fn last_complete_block(&self) -> Option<&TokenBlock> {
1583        self.blocks.last()
1584    }
1585
1586    /// Returns a reference to the current [`PartialTokenBlock`] where new tokens are added.
1587    pub fn current_block(&self) -> &PartialTokenBlock {
1588        &self.current_block
1589    }
1590
1591    /// Consumes the sequence and returns its parts: a `Vec` of completed blocks and the final partial block.
1592    pub fn into_parts(self) -> (Vec<TokenBlock>, PartialTokenBlock) {
1593        (self.blocks, self.current_block)
1594    }
1595
1596    /// Returns the block size used for this sequence.
1597    pub fn block_size(&self) -> usize {
1598        self.block_size
1599    }
1600
1601    /// Returns the [`SaltHash`] used for this sequence.
1602    pub fn salt_hash(&self) -> SaltHash {
1603        self.salt_hash
1604    }
1605
1606    /// Returns the total number of tokens in the sequence (sum of tokens in all completed blocks
1607    /// plus tokens in the current partial block).
1608    pub fn total_tokens(&self) -> usize {
1609        let block_size = self.current_block.block_size as usize;
1610        (self.blocks.len() * block_size) + self.current_block.len()
1611    }
1612
1613    /// Extract the token with the range
1614    pub fn tokens_at(&self, range: Range<usize>) -> Tokens {
1615        let total = self.total_tokens();
1616
1617        // Validate range - return empty tokens for invalid ranges
1618        if range.start > range.end || range.end > total {
1619            return Tokens::default();
1620        }
1621
1622        // Handle empty range
1623        if range.is_empty() {
1624            return Tokens::default();
1625        }
1626
1627        let mut result = Vec::with_capacity(range.len());
1628
1629        for i in range {
1630            if i < self.blocks.len() * self.block_size {
1631                // Token is in a completed block
1632                let block_index = i / self.block_size;
1633                let token_index = i % self.block_size;
1634                result.push(self.blocks[block_index].tokens()[token_index]);
1635            } else {
1636                // Token is in the current partial block
1637                let current_block_index = i - (self.blocks.len() * self.block_size);
1638                result.push(self.current_block.tokens()[current_block_index]);
1639            }
1640        }
1641
1642        Tokens::from(result)
1643    }
1644
1645    /// Splits a [`Tokens`] object into a vector of completed blocks and a final partial block.
1646    ///
1647    /// This is primarily used internally by [`TokenBlockSequence::new`] but can be used externally.
1648    ///
1649    /// # Arguments
1650    ///
1651    /// * `tokens` - The [`Tokens`] to split.
1652    /// * `block_size` - The size of each block.
1653    /// * `salt_hash` - The [`SaltHash`] to use for hashing.
1654    ///
1655    /// # Returns
1656    ///
1657    /// A tuple containing `(Vec<TokenBlock>, PartialTokenBlock)`.
1658    ///
1659    /// # Panics
1660    ///
1661    /// Panics if `block_size` is 0.
1662    pub fn split_tokens(
1663        tokens: &[Token],
1664        block_size: u32,
1665        salt_hash: SaltHash,
1666    ) -> (Vec<TokenBlock>, PartialTokenBlock) {
1667        assert!(block_size > 0, "block_size must be greater than 0");
1668        let chunks: Vec<TokenBlockChunk> = tokens
1669            .as_ref()
1670            .chunks_exact(block_size as usize)
1671            .map(|chunk| TokenBlockChunk::from_tokens(chunk, salt_hash))
1672            .collect();
1673
1674        let mut result_blocks = Vec::with_capacity(chunks.len());
1675        let mut last_sequence_hash: Option<SequenceHash> = None;
1676
1677        // Sequentially combine chunks to compute sequence hashes
1678        for (position, chunk) in chunks.into_iter().enumerate() {
1679            let new_block = TokenBlock::from_chunk(chunk, last_sequence_hash, position);
1680            last_sequence_hash = Some(new_block.sequence_hash());
1681            result_blocks.push(new_block);
1682        }
1683
1684        // Handle any remaining tokens
1685        let remainder = tokens
1686            .as_ref()
1687            .chunks_exact(block_size as usize)
1688            .remainder();
1689
1690        let next_position = result_blocks.len(); // Position for the next block to be committed
1691
1692        let mut partial_tokens = Tokens::with_capacity(block_size as usize);
1693        partial_tokens.0.extend_from_slice(remainder);
1694
1695        let current_block = PartialTokenBlock {
1696            tokens: partial_tokens,
1697            block_size,
1698            salt_hash,
1699            // Parent hash is the sequence hash of the last *full* block computed
1700            parent_sequence_hash: last_sequence_hash,
1701            position: next_position,
1702        };
1703
1704        (result_blocks, current_block)
1705    }
1706
1707    /// Creates a new [`TokenBlockSequence`] from a slice of tokens.
1708    ///
1709    /// The tokens are split into blocks of `block_size`. Any remaining tokens
1710    /// form the initial `current_block`.
1711    ///
1712    /// # Arguments
1713    ///
1714    /// * `tokens` - The slice of tokens to create the sequence from.
1715    /// * `block_size` - The size of each block.
1716    /// * `salt_hash` - The [`SaltHash`] to use for hashing.
1717    pub fn from_slice(tokens: &[Token], block_size: u32, salt_hash: Option<SaltHash>) -> Self {
1718        assert!(block_size > 0, "block_size must be greater than 0");
1719        let salt_hash = salt_hash.unwrap_or_default();
1720        let (blocks, current_block) = Self::split_tokens(tokens, block_size, salt_hash);
1721
1722        Self {
1723            blocks,
1724            current_block,
1725            salt_hash,
1726            block_size: block_size as usize,
1727            mm_runs: Vec::new(),
1728        }
1729    }
1730
1731    /// Creates a [`TokenBlockSequence`] with multimodal placeholder runs.
1732    ///
1733    /// `mm_info` is validated and sorted via [`validate_and_sort_mm_info`]. Each block's
1734    /// [`BlockHash`] is computed using the per-block byte encoding documented on
1735    /// [`compute_block_bytes_with_mm`], which selects one of two encodings:
1736    ///
1737    /// - **MM-affected block** (at least one run overlaps the block): every slot emits
1738    ///   13 bytes — a 1-byte tag ([`MM_SLOT_TAG_TOKEN`] or [`MM_SLOT_TAG_PLACEHOLDER`])
1739    ///   followed by a 12-byte payload. Real-token slots carry `token_id u32 LE` plus
1740    ///   8 bytes of padding; placeholder slots carry `run_offset u32 LE | mm_hash u64 LE`.
1741    /// - **Non-MM block** (no run overlaps): the legacy `bytemuck::cast_slice(tokens)`
1742    ///   form is used (4 bytes per slot, LE u32), preserving cache identity with blocks
1743    ///   produced by [`Self::from_slice`].
1744    ///
1745    /// Returns an error if `mm_info` is invalid (overlap, out of bounds, zero-length run).
1746    pub fn new_with_mm(
1747        tokens: Tokens,
1748        mm_info: &[TokenBlockMmInfo],
1749        block_size: u32,
1750        salt_hash: Option<SaltHash>,
1751    ) -> Result<Self, TokenBlockError> {
1752        assert!(block_size > 0, "block_size must be greater than 0");
1753        let salt_hash = salt_hash.unwrap_or_default();
1754        let validated =
1755            validate_and_sort_mm_info(mm_info, tokens.len()).map_err(TokenBlockError::MmInfo)?;
1756        let (blocks, current_block) =
1757            Self::split_tokens_with_mm(&tokens, &validated, block_size, salt_hash);
1758        Ok(Self {
1759            blocks,
1760            current_block,
1761            salt_hash,
1762            block_size: block_size as usize,
1763            mm_runs: validated,
1764        })
1765    }
1766
1767    /// MM-aware variant of [`Self::split_tokens`].
1768    ///
1769    /// `mm_runs` must be pre-validated and sorted (e.g., via [`validate_and_sort_mm_info`]).
1770    pub fn split_tokens_with_mm(
1771        tokens: &[Token],
1772        mm_runs: &[TokenBlockMmInfo],
1773        block_size: u32,
1774        salt_hash: SaltHash,
1775    ) -> (Vec<TokenBlock>, PartialTokenBlock) {
1776        assert!(block_size > 0, "block_size must be greater than 0");
1777        let bs = block_size as usize;
1778        let n_complete = tokens.len() / bs;
1779        let mut result_blocks = Vec::with_capacity(n_complete);
1780        let mut last_seq_hash: Option<SequenceHash> = None;
1781        for i in 0..n_complete {
1782            let block_offset = i * bs;
1783            let block_tokens = &tokens[block_offset..block_offset + bs];
1784            let block_bytes = compute_block_bytes_with_mm(block_tokens, block_offset, mm_runs);
1785            let block_hash = compute_block_hash(&block_bytes, salt_hash);
1786            let chunk = TokenBlockChunk {
1787                tokens: block_tokens.into(),
1788                salt_hash,
1789                block_hash,
1790            };
1791            let new_block = TokenBlock::from_chunk(chunk, last_seq_hash, i);
1792            last_seq_hash = Some(new_block.sequence_hash());
1793            result_blocks.push(new_block);
1794        }
1795        let remainder = &tokens[n_complete * bs..];
1796        let current_block = PartialTokenBlock {
1797            tokens: remainder.into(),
1798            block_size,
1799            salt_hash,
1800            parent_sequence_hash: last_seq_hash,
1801            position: n_complete,
1802        };
1803        (result_blocks, current_block)
1804    }
1805
1806    /// Returns the validated, sorted multimodal runs accumulated by this sequence.
1807    pub fn mm_runs(&self) -> &[TokenBlockMmInfo] {
1808        &self.mm_runs
1809    }
1810
1811    /// Appends a single real token to the sequence.
1812    ///
1813    /// Equivalent to [`Self::append`] but named for symmetry with [`Self::push_mm_run`].
1814    pub fn push_token(&mut self, token: Token) -> Result<Option<usize>, TokenBlockError> {
1815        self.append(token)
1816    }
1817
1818    /// Appends a multimodal placeholder run of `length` slots all tagged with `mm_hash`.
1819    ///
1820    /// The placeholder run starts at the current end of the sequence (`total_tokens()` before
1821    /// the call). The token IDs at placeholder slot positions are filled with zero sentinels;
1822    /// hashing uses the `(mm_hash, run_offset)` pair instead of those token bytes.
1823    ///
1824    /// Returns the range of fully-committed block indices completed during the call (if any).
1825    pub fn push_mm_run(
1826        &mut self,
1827        mm_hash: u64,
1828        length: usize,
1829    ) -> Result<Option<Range<usize>>, TokenBlockError> {
1830        if length == 0 {
1831            return Err(TokenBlockError::MmInfo(MmInfoError::EmptyRun));
1832        }
1833        let offset = self.total_tokens();
1834        self.mm_runs.push(TokenBlockMmInfo {
1835            mm_hash,
1836            offset,
1837            length,
1838        });
1839        // The token values at placeholder slots are opaque for hashing; use 0 sentinels.
1840        let placeholders = Tokens::from(vec![0u32; length]);
1841        self.extend(placeholders)
1842    }
1843
1844    /// Batch-validated extension: appends `tokens` (with embedded multimodal runs in `mm_info`)
1845    /// to the sequence in a single validated step.
1846    ///
1847    /// `mm_info` offsets are **relative to the start of `tokens`** (not the existing sequence).
1848    /// The function validates the chunk, then translates to absolute offsets and applies the
1849    /// updates atomically: it errors before mutating any state if `mm_info` is invalid.
1850    ///
1851    /// Real-token regions and placeholder runs are interleaved per the chunk's layout.
1852    pub fn extend_with_mm(
1853        &mut self,
1854        tokens: &[Token],
1855        mm_info: &[TokenBlockMmInfo],
1856    ) -> Result<Option<Range<usize>>, TokenBlockError> {
1857        let validated =
1858            validate_and_sort_mm_info(mm_info, tokens.len()).map_err(TokenBlockError::MmInfo)?;
1859        let start_block = self.blocks.len();
1860        let mut cursor = 0usize;
1861        for run in &validated {
1862            if run.offset > cursor {
1863                let real = Tokens::from(tokens[cursor..run.offset].to_vec());
1864                self.extend(real)?;
1865            }
1866            self.push_mm_run(run.mm_hash, run.length)?;
1867            cursor = run.offset + run.length;
1868        }
1869        if cursor < tokens.len() {
1870            let real = Tokens::from(tokens[cursor..].to_vec());
1871            self.extend(real)?;
1872        }
1873        let end_block = self.blocks.len();
1874        if start_block == end_block {
1875            Ok(None)
1876        } else {
1877            Ok(Some(start_block..end_block))
1878        }
1879    }
1880}
1881
1882#[cfg(test)]
1883mod tests {
1884    use super::*;
1885    use bytemuck::cast_slice;
1886
1887    // Helper to create a sequence for testing
1888    fn create_test_sequence(
1889        initial_tokens: &[Token],
1890        block_size: u32,
1891        salt_hash: Option<SaltHash>,
1892    ) -> TokenBlockSequence {
1893        TokenBlockSequence::new(Tokens::from(initial_tokens), block_size, salt_hash)
1894    }
1895
1896    // Helper to get expected hashes (replace with actual calculated values if needed)
1897    const TEST_SALT_HASH: SaltHash = 1337;
1898    const HASH_1_4: BlockHash = 14643705804678351452; // hash([1,2,3,4], 1337)
1899    const SEQ_HASH_1_4: SequenceHash = HASH_1_4;
1900    const HASH_5_8: BlockHash = 16777012769546811212; // hash([5,6,7,8], 1337)
1901    const SEQ_HASH_5_8: SequenceHash = 4945711292740353085; // hash([SEQ_HASH_1_4, HASH_5_8], CHAIN_XXH3_SEED)
1902    const HASH_9_12: BlockHash = 483935686894639516; // hash([9,10,11,12], 1337)
1903    const SEQ_HASH_9_12: SequenceHash = 12583592247330656132; // hash([SEQ_HASH_5_8, HASH_9_12], CHAIN_XXH3_SEED)
1904
1905    impl PartialTokenBlock {
1906        /// Attempts to remove the last token from the block.
1907        ///
1908        /// # Returns
1909        ///
1910        /// * `Ok(())` - If a token was successfully removed.
1911        /// * `Err(TokenBlockError::Empty)` - If the block was already empty.
1912        pub fn pop_token(&mut self) -> Result<(), TokenBlockError> {
1913            if self.tokens.0.is_empty() {
1914                return Err(TokenBlockError::Empty);
1915            }
1916            self.tokens.0.pop();
1917            Ok(())
1918        }
1919    }
1920
1921    #[test]
1922    fn test_validate_hash_constants() {
1923        let salt = TEST_SALT_HASH;
1924
1925        // Block 1: [1, 2, 3, 4]
1926        let tokens_1_4 = &[1u32, 2, 3, 4];
1927        let computed_hash_1_4 = compute_block_hash(cast_slice(tokens_1_4), salt);
1928        assert_eq!(computed_hash_1_4, HASH_1_4, "Mismatch for HASH_1_4");
1929        // First block's sequence hash is its block hash
1930        assert_eq!(computed_hash_1_4, SEQ_HASH_1_4, "Mismatch for SEQ_HASH_1_4");
1931
1932        // Block 2: [5, 6, 7, 8]
1933        let tokens_5_8 = &[5u32, 6, 7, 8];
1934        let computed_hash_5_8 = compute_block_hash(cast_slice(tokens_5_8), salt);
1935        assert_eq!(computed_hash_5_8, HASH_5_8, "Mismatch for HASH_5_8");
1936        // Chain step uses the shared CHAIN_XXH3_SEED; salt is already mixed into block_hash.
1937        let computed_seq_hash_5_8 = compute_next_sequence_hash(SEQ_HASH_1_4, HASH_5_8);
1938        assert_eq!(
1939            computed_seq_hash_5_8, SEQ_HASH_5_8,
1940            "Mismatch for SEQ_HASH_5_8"
1941        );
1942
1943        // Block 3: [9, 10, 11, 12]
1944        let tokens_9_12 = &[9u32, 10, 11, 12];
1945        let computed_hash_9_12 = compute_block_hash(cast_slice(tokens_9_12), salt);
1946        assert_eq!(computed_hash_9_12, HASH_9_12, "Mismatch for HASH_9_12");
1947        let computed_seq_hash_9_12 = compute_next_sequence_hash(SEQ_HASH_5_8, HASH_9_12);
1948        assert_eq!(
1949            computed_seq_hash_9_12, SEQ_HASH_9_12,
1950            "Mismatch for SEQ_HASH_9_12"
1951        );
1952    }
1953
1954    #[test]
1955    fn test_positional_sequence_hash_encoding_decoding() {
1956        // Test Mode 0: position fits in 8 bits (< 256)
1957        let seq_hash_0 = 0x1234567890ABCDEF;
1958        let position_0 = 100;
1959        let lbh_0 = 0xFEDCBA9876543210;
1960        let psh_0 = PositionalSequenceHash::new(seq_hash_0, position_0, lbh_0);
1961
1962        assert_eq!(psh_0.mode(), 0, "Position 100 should use mode 0");
1963        assert_eq!(psh_0.sequence_hash(), seq_hash_0);
1964        assert_eq!(psh_0.position(), position_0);
1965        // LBH is truncated to 54 bits in mode 0
1966        assert_eq!(
1967            psh_0.local_block_hash(),
1968            lbh_0 & ((1u64 << 54) - 1),
1969            "LBH should be truncated to 54 bits"
1970        );
1971
1972        // Test Mode 1: position fits in 16 bits (256 <= pos < 65536)
1973        let position_1 = 1000;
1974        let psh_1 = PositionalSequenceHash::new(seq_hash_0, position_1, lbh_0);
1975
1976        assert_eq!(psh_1.mode(), 1, "Position 1000 should use mode 1");
1977        assert_eq!(psh_1.sequence_hash(), seq_hash_0);
1978        assert_eq!(psh_1.position(), position_1);
1979        // LBH is truncated to 46 bits in mode 1
1980        assert_eq!(
1981            psh_1.local_block_hash(),
1982            lbh_0 & ((1u64 << 46) - 1),
1983            "LBH should be truncated to 46 bits"
1984        );
1985
1986        // Test Mode 2: position fits in 24 bits (65536 <= pos < 16777216)
1987        let position_2 = 100_000;
1988        let psh_2 = PositionalSequenceHash::new(seq_hash_0, position_2, lbh_0);
1989
1990        assert_eq!(psh_2.mode(), 2, "Position 100,000 should use mode 2");
1991        assert_eq!(psh_2.sequence_hash(), seq_hash_0);
1992        assert_eq!(psh_2.position(), position_2);
1993        // LBH is truncated to 38 bits in mode 2
1994        assert_eq!(
1995            psh_2.local_block_hash(),
1996            lbh_0 & ((1u64 << 38) - 1),
1997            "LBH should be truncated to 38 bits"
1998        );
1999
2000        // Test Mode 3: position fits in 31 bits (16777216 <= pos < 2^31)
2001        let position_3 = 20_000_000;
2002        let psh_3 = PositionalSequenceHash::new(seq_hash_0, position_3, lbh_0);
2003
2004        assert_eq!(psh_3.mode(), 3, "Position 20,000,000 should use mode 3");
2005        assert_eq!(psh_3.sequence_hash(), seq_hash_0);
2006        assert_eq!(psh_3.position(), position_3);
2007        // LBH is truncated to 31 bits in mode 3
2008        assert_eq!(
2009            psh_3.local_block_hash(),
2010            lbh_0 & ((1u64 << 31) - 1),
2011            "LBH should be truncated to 31 bits"
2012        );
2013
2014        // Test edge case: position at boundary
2015        let position_255 = 255;
2016        let psh_255 = PositionalSequenceHash::new(seq_hash_0, position_255, lbh_0);
2017        assert_eq!(psh_255.mode(), 0, "Position 255 should use mode 0");
2018        assert_eq!(psh_255.position(), position_255);
2019
2020        let position_256 = 256;
2021        let psh_256 = PositionalSequenceHash::new(seq_hash_0, position_256, lbh_0);
2022        assert_eq!(psh_256.mode(), 1, "Position 256 should use mode 1");
2023        assert_eq!(psh_256.position(), position_256);
2024    }
2025
2026    #[test]
2027    fn test_positional_lineage_hash() {
2028        // Test Mode 0: position fits in 8 bits (< 256)
2029        let current_hash_0 = 0x1234567890ABCDEF;
2030        let parent_hash_0 = 0xFEDCBA9876543210;
2031        let position_0 = 100;
2032        let plh_0 = PositionalLineageHash::new(current_hash_0, Some(parent_hash_0), position_0);
2033
2034        assert_eq!(plh_0.mode(), 0, "Position 100 should use mode 0");
2035        assert_eq!(plh_0.position(), position_0);
2036        // Current carries the full u64; parent fragment is 54 bits in mode 0
2037        assert_eq!(
2038            plh_0.current_sequence_hash(),
2039            current_hash_0,
2040            "Current sequence hash should be stored in full"
2041        );
2042        assert_eq!(
2043            plh_0.parent_hash_fragment(),
2044            parent_hash_0 & ((1u64 << 54) - 1),
2045            "Parent fragment should be truncated to 54 bits in mode 0"
2046        );
2047
2048        // Test Mode 1: position fits in 16 bits (256 <= pos < 65536)
2049        let position_1 = 1000;
2050        let plh_1 = PositionalLineageHash::new(current_hash_0, Some(parent_hash_0), position_1);
2051
2052        assert_eq!(plh_1.mode(), 1, "Position 1000 should use mode 1");
2053        assert_eq!(plh_1.position(), position_1);
2054        assert_eq!(plh_1.current_sequence_hash(), current_hash_0);
2055        assert_eq!(
2056            plh_1.parent_hash_fragment(),
2057            parent_hash_0 & ((1u64 << 46) - 1),
2058            "Parent fragment should be truncated to 46 bits in mode 1"
2059        );
2060
2061        // Test Mode 2: position fits in 24 bits (65536 <= pos < 16777216)
2062        let position_2 = 100_000;
2063        let plh_2 = PositionalLineageHash::new(current_hash_0, Some(parent_hash_0), position_2);
2064
2065        assert_eq!(plh_2.mode(), 2, "Position 100,000 should use mode 2");
2066        assert_eq!(plh_2.position(), position_2);
2067        assert_eq!(plh_2.current_sequence_hash(), current_hash_0);
2068        assert_eq!(
2069            plh_2.parent_hash_fragment(),
2070            parent_hash_0 & ((1u64 << 38) - 1),
2071            "Parent fragment should be truncated to 38 bits in mode 2"
2072        );
2073
2074        // Test edge cases: position at boundaries
2075        let position_255 = 255;
2076        let plh_255 = PositionalLineageHash::new(current_hash_0, Some(parent_hash_0), position_255);
2077        assert_eq!(plh_255.mode(), 0, "Position 255 should use mode 0");
2078        assert_eq!(plh_255.position(), position_255);
2079
2080        let position_256 = 256;
2081        let plh_256 = PositionalLineageHash::new(current_hash_0, Some(parent_hash_0), position_256);
2082        assert_eq!(plh_256.mode(), 1, "Position 256 should use mode 1");
2083        assert_eq!(plh_256.position(), position_256);
2084
2085        let position_65535 = 65535;
2086        let plh_65535 =
2087            PositionalLineageHash::new(current_hash_0, Some(parent_hash_0), position_65535);
2088        assert_eq!(plh_65535.mode(), 1, "Position 65535 should use mode 1");
2089        assert_eq!(plh_65535.position(), position_65535);
2090
2091        let position_65536 = 65536;
2092        let plh_65536 =
2093            PositionalLineageHash::new(current_hash_0, Some(parent_hash_0), position_65536);
2094        assert_eq!(plh_65536.mode(), 2, "Position 65536 should use mode 2");
2095        assert_eq!(plh_65536.position(), position_65536);
2096
2097        // Test with None parent (root block)
2098        let plh_root = PositionalLineageHash::new(current_hash_0, None, 0);
2099        assert_eq!(plh_root.mode(), 0);
2100        assert_eq!(plh_root.position(), 0);
2101        assert_eq!(
2102            plh_root.parent_hash_fragment(),
2103            0,
2104            "Root should have zero parent fragment"
2105        );
2106        assert_eq!(plh_root.current_sequence_hash(), current_hash_0);
2107    }
2108
2109    #[test]
2110    #[should_panic(expected = "Position 16777216 exceeds maximum supported value")]
2111    fn test_positional_lineage_hash_panic_on_large_position() {
2112        let current_hash = 0x1234567890ABCDEF;
2113        let parent_hash = 0xFEDCBA9876543210;
2114        let position = 1u64 << 24; // 2^24 = 16,777,216
2115        let _ = PositionalLineageHash::new(current_hash, Some(parent_hash), position);
2116    }
2117
2118    #[test]
2119    fn test_positional_lineage_hash_mode_boundary_alignment() {
2120        // Test that backward radix traversal still works across mode boundaries.
2121        // Under the asymmetric layout, the parent fragment is sized to the *child's*
2122        // mode and pulled from the parent's full u64 via
2123        // `parent_fragment_for_child_position`.
2124
2125        let parent_hash = 0xFEDCBA9876543210;
2126        let current_hash_255 = 0x1234567890ABCDEF;
2127        let current_hash_256 = 0xABCDEF0123456789;
2128
2129        // Position 255: Mode 0 (last position before boundary)
2130        let plh_255 = PositionalLineageHash::new(current_hash_255, Some(parent_hash), 255);
2131        assert_eq!(plh_255.mode(), 0);
2132        assert_eq!(plh_255.current_sequence_hash(), current_hash_255);
2133
2134        // Position 256: Mode 1 (first position after boundary)
2135        let plh_256 = PositionalLineageHash::new(current_hash_256, Some(current_hash_255), 256);
2136        assert_eq!(plh_256.mode(), 1);
2137
2138        // CRITICAL: child's parent_fragment must equal parent's current truncated to
2139        // the child's mode width (46 bits at mode 1).
2140        let mask_46 = (1u64 << 46) - 1;
2141        assert_eq!(
2142            plh_256.parent_hash_fragment(),
2143            current_hash_255 & mask_46,
2144            "Mode boundary: position 256's parent fragment matches position 255's current truncated to 46 bits"
2145        );
2146        assert_eq!(
2147            plh_256.parent_hash_fragment(),
2148            plh_255.parent_fragment_for_child_position(256),
2149            "parent_fragment_for_child_position helper should match"
2150        );
2151
2152        // Test the other boundary: 65535 -> 65536 (Mode 1 -> Mode 2)
2153        let current_hash_65535 = 0x1111222233334444;
2154        let current_hash_65536 = 0x5555666677778888;
2155
2156        let plh_65535 = PositionalLineageHash::new(current_hash_65535, Some(parent_hash), 65535);
2157        assert_eq!(plh_65535.mode(), 1);
2158
2159        let plh_65536 =
2160            PositionalLineageHash::new(current_hash_65536, Some(current_hash_65535), 65536);
2161        assert_eq!(plh_65536.mode(), 2);
2162
2163        // Mode 2 parent fragment is 38 bits.
2164        let mask_38 = (1u64 << 38) - 1;
2165        assert_eq!(
2166            plh_65536.parent_hash_fragment(),
2167            current_hash_65535 & mask_38,
2168            "Mode boundary: position 65536's parent fragment matches position 65535's current truncated to 38 bits"
2169        );
2170        assert_eq!(
2171            plh_65536.parent_hash_fragment(),
2172            plh_65535.parent_fragment_for_child_position(65536),
2173        );
2174    }
2175
2176    #[test]
2177    fn test_positional_lineage_hash_extend() {
2178        // PLH must be self-extending: a chain built from PLH::root + extend should be
2179        // bitwise identical to the chain produced by full TokenBlock construction.
2180        let salt: SaltHash = 1337;
2181        let bh: [BlockHash; 3] = [
2182            compute_block_hash(cast_slice(&[1u32, 2, 3, 4]), salt),
2183            compute_block_hash(cast_slice(&[5u32, 6, 7, 8]), salt),
2184            compute_block_hash(cast_slice(&[9u32, 10, 11, 12]), salt),
2185        ];
2186
2187        // Direct construction via TokenBlock
2188        let blk0 =
2189            TokenBlock::from_chunk(TokenBlockChunk::from_tokens(&[1, 2, 3, 4], salt), None, 0);
2190        let blk1 = TokenBlock::from_chunk(
2191            TokenBlockChunk::from_tokens(&[5, 6, 7, 8], salt),
2192            Some(blk0.sequence_hash()),
2193            1,
2194        );
2195        let blk2 = TokenBlock::from_chunk(
2196            TokenBlockChunk::from_tokens(&[9, 10, 11, 12], salt),
2197            Some(blk1.sequence_hash()),
2198            2,
2199        );
2200
2201        // Self-extending construction via PLH::root + extend
2202        let plh0 = PositionalLineageHash::root(bh[0]);
2203        let plh1 = plh0.extend(bh[1]);
2204        let plh2 = plh1.extend(bh[2]);
2205
2206        assert_eq!(plh0.as_u128(), blk0.positional_lineage_hash().as_u128());
2207        assert_eq!(plh1.as_u128(), blk1.positional_lineage_hash().as_u128());
2208        assert_eq!(plh2.as_u128(), blk2.positional_lineage_hash().as_u128());
2209
2210        // Chain definition: extended.current_sequence_hash == compute_next_sequence_hash(parent, child)
2211        assert_eq!(
2212            plh1.current_sequence_hash(),
2213            compute_next_sequence_hash(plh0.current_sequence_hash(), bh[1]),
2214        );
2215
2216        // Salt propagation: changing salt changes block_hash[0] and therefore every
2217        // subsequent PLH, even though salt no longer seeds the per-step chain.
2218        let alt_salt: SaltHash = 4242;
2219        let alt_bh0 = compute_block_hash(cast_slice(&[1u32, 2, 3, 4]), alt_salt);
2220        assert_ne!(alt_bh0, bh[0]);
2221        let alt_plh0 = PositionalLineageHash::root(alt_bh0);
2222        let alt_plh1 = alt_plh0.extend(compute_block_hash(cast_slice(&[5u32, 6, 7, 8]), alt_salt));
2223        assert_ne!(alt_plh0.as_u128(), plh0.as_u128());
2224        assert_ne!(alt_plh1.as_u128(), plh1.as_u128());
2225    }
2226
2227    #[test]
2228    fn test_tokens_from() {
2229        let vec_u32: Vec<u32> = vec![1, 2, 3];
2230        let tokens_u32: Tokens = vec_u32.clone().into();
2231        assert_eq!(tokens_u32.0, vec_u32);
2232
2233        let slice_u32: &[u32] = &[4, 5];
2234        let tokens_slice_u32: Tokens = slice_u32.into();
2235        assert_eq!(tokens_slice_u32.0, vec![4, 5]);
2236
2237        let vec_i32: Vec<i32> = vec![-1, 0, 1]; // Note: -1 becomes large u32
2238        let tokens_i32: Tokens = vec_i32.into();
2239        assert_eq!(tokens_i32.0, vec![u32::MAX, 0, 1]);
2240
2241        let slice_i32: &[i32] = &[100, 200];
2242        let tokens_slice_i32: Tokens = slice_i32.into();
2243        assert_eq!(tokens_slice_i32.0, vec![100, 200]);
2244
2245        let into_vec: Vec<u32> = tokens_slice_i32.into();
2246        assert_eq!(into_vec, vec![100, 200]);
2247    }
2248
2249    #[test]
2250    fn test_tokens_equality() {
2251        let tokens = Tokens::from(vec![1, 2, 3]);
2252        assert_eq!(tokens, vec![1, 2, 3]);
2253        assert_eq!(vec![1, 2, 3], tokens);
2254        assert_eq!(tokens, &[1, 2, 3][..]);
2255        assert_eq!(&[1, 2, 3][..], tokens);
2256        assert_eq!(tokens, Tokens::from(vec![1, 2, 3]));
2257        assert_ne!(tokens, Tokens::from(vec![1, 2, 4]));
2258    }
2259
2260    #[test]
2261    fn test_tokens_deref_asref() {
2262        let tokens = Tokens::from(vec![10, 20, 30]);
2263
2264        // Deref to &[Token]
2265        assert_eq!(tokens.len(), 3);
2266        assert_eq!(tokens[1], 20);
2267        let slice: &[Token] = &tokens;
2268        assert_eq!(slice, &[10, 20, 30]);
2269
2270        // AsRef<[Token]>
2271        let as_ref_slice: &[Token] = tokens.as_ref();
2272        assert_eq!(as_ref_slice, &[10, 20, 30]);
2273
2274        // Borrow<[Token]>
2275        let borrowed_slice: &[Token] = std::borrow::Borrow::borrow(&tokens);
2276        assert_eq!(borrowed_slice, &[10, 20, 30]);
2277    }
2278
2279    #[test]
2280    fn test_tokens_into_sequence() {
2281        let tokens = Tokens::from(vec![1, 2, 3, 4, 5]);
2282        let seq = tokens.into_sequence(3, Some(TEST_SALT_HASH));
2283        assert_eq!(seq.blocks().len(), 1);
2284        assert_eq!(seq.blocks[0].tokens().as_ref(), &[1, 2, 3]);
2285        assert_eq!(seq.current_block().tokens().as_ref(), &[4, 5]);
2286        assert_eq!(seq.salt_hash(), TEST_SALT_HASH);
2287    }
2288
2289    #[test]
2290    fn test_partial_block_ops() {
2291        let mut partial = PartialTokenBlock::create_sequence_root(3, TEST_SALT_HASH);
2292        assert_eq!(partial.len(), 0);
2293        assert_eq!(partial.remaining(), 3);
2294        assert!(partial.is_empty());
2295
2296        // Push tokens
2297        assert!(partial.push_token(1).is_ok());
2298        assert_eq!(partial.len(), 1);
2299        assert_eq!(partial.remaining(), 2);
2300        let remaining = partial.push_tokens(Tokens::from(vec![2, 3, 4]));
2301        assert_eq!(partial.len(), 3);
2302        assert_eq!(partial.remaining(), 0);
2303        assert_eq!(remaining.as_ref(), &[4]); // Token 4 didn't fit
2304        assert_eq!(partial.tokens().as_ref(), &[1, 2, 3]);
2305
2306        // Push when full
2307        assert_eq!(partial.push_token(5), Err(TokenBlockError::Full));
2308        let remaining_full = partial.push_tokens(Tokens::from(vec![5]));
2309        assert_eq!(remaining_full.as_ref(), &[5]);
2310
2311        // Pop tokens
2312        assert!(partial.pop_token().is_ok());
2313        assert_eq!(partial.len(), 2);
2314        assert_eq!(partial.tokens().as_ref(), &[1, 2]);
2315        assert!(partial.pop_tokens(2).is_ok());
2316        assert!(partial.is_empty());
2317
2318        // Pop when empty
2319        assert_eq!(partial.pop_token(), Err(TokenBlockError::Empty));
2320        assert_eq!(
2321            partial.pop_tokens(1),
2322            Err(TokenBlockError::InsufficientTokens)
2323        );
2324
2325        // Commit incomplete
2326        assert!(partial.push_token(10).is_ok());
2327        assert_eq!(partial.commit(), Err(TokenBlockError::Incomplete));
2328
2329        // Commit complete
2330        assert!(partial.push_token(11).is_ok());
2331        assert!(partial.push_token(12).is_ok());
2332        assert_eq!(partial.len(), 3);
2333        let commit_result = partial.commit();
2334        assert!(commit_result.is_ok());
2335        let committed_block = commit_result.unwrap();
2336        assert_eq!(committed_block.tokens().as_ref(), &[10, 11, 12]);
2337
2338        // Check state after commit (partial block is now the next one)
2339        assert!(partial.is_empty());
2340        assert_eq!(
2341            partial.parent_sequence_hash,
2342            Some(committed_block.sequence_hash())
2343        );
2344        assert_eq!(partial.block_size, 3);
2345    }
2346
2347    #[test]
2348    fn test_token_block_creation_and_hashes() {
2349        let salt = TEST_SALT_HASH;
2350        let tokens1 = Tokens::from(vec![1, 2, 3, 4]);
2351        let chunk1 = TokenBlockChunk::new(tokens1.clone(), salt);
2352        let block1 = TokenBlock::from_chunk(chunk1, None, 0);
2353
2354        assert_eq!(block1.tokens(), &tokens1);
2355        assert_eq!(block1.salt_hash(), salt);
2356        assert_eq!(block1.parent_sequence_hash(), None);
2357        assert_eq!(block1.block_hash(), HASH_1_4);
2358        assert_eq!(block1.sequence_hash(), SEQ_HASH_1_4); // First block seq_hash == block_hash
2359        assert_eq!(block1.position(), 0); // First block is at position 0
2360
2361        // Verify positional lineage hash for block 1
2362        let plh1 = block1.positional_lineage_hash();
2363        assert_eq!(plh1.position(), 0);
2364        assert_eq!(plh1.parent_hash_fragment(), 0); // Root has no parent
2365        assert_eq!(plh1.current_sequence_hash(), SEQ_HASH_1_4);
2366
2367        let tokens2 = Tokens::from(vec![5, 6, 7, 8]);
2368        let chunk2 = TokenBlockChunk::new(tokens2.clone(), salt);
2369        let block2 = TokenBlock::from_chunk(chunk2, block1.parent_sequence_hash(), 1); // Incorrect parent
2370        // Sequence hash should differ if parent is wrong
2371        assert_ne!(block2.sequence_hash(), SEQ_HASH_5_8);
2372
2373        let chunk2_correct = TokenBlockChunk::new(tokens2.clone(), salt);
2374        let block2_correct =
2375            TokenBlock::from_chunk(chunk2_correct, Some(block1.sequence_hash()), 1);
2376
2377        assert_eq!(block2_correct.tokens(), &tokens2);
2378        assert_eq!(block2_correct.salt_hash(), salt);
2379        assert_eq!(
2380            block2_correct.parent_sequence_hash(),
2381            Some(block1.sequence_hash())
2382        );
2383        assert_eq!(block2_correct.block_hash(), HASH_5_8);
2384        assert_eq!(block2_correct.sequence_hash(), SEQ_HASH_5_8);
2385        assert_eq!(block2_correct.position(), 1); // Second block is at position 1
2386
2387        // Verify positional lineage hash for block 2
2388        let plh2 = block2_correct.positional_lineage_hash();
2389        assert_eq!(plh2.position(), 1);
2390        assert_eq!(
2391            plh2.parent_hash_fragment(),
2392            SEQ_HASH_1_4 & ((1u64 << 54) - 1)
2393        ); // Parent fragment is the parent's u64 truncated to 54 bits in mode 0
2394        assert_eq!(plh2.current_sequence_hash(), SEQ_HASH_5_8);
2395    }
2396
2397    #[test]
2398    fn test_new_sequence() {
2399        // Empty initial tokens
2400        let seq_empty = create_test_sequence(&[], 4, Some(TEST_SALT_HASH));
2401        assert!(seq_empty.blocks().is_empty());
2402        assert!(seq_empty.current_block().is_empty());
2403        assert_eq!(seq_empty.total_tokens(), 0);
2404        assert_eq!(seq_empty.salt_hash(), TEST_SALT_HASH);
2405        assert_eq!(seq_empty.current_block().parent_sequence_hash, None);
2406
2407        // Less than one block
2408        let seq_partial = create_test_sequence(&[1, 2], 4, Some(TEST_SALT_HASH));
2409        assert!(seq_partial.blocks().is_empty());
2410        assert_eq!(seq_partial.current_block().tokens().as_ref(), &[1, 2]);
2411        assert_eq!(seq_partial.total_tokens(), 2);
2412        assert_eq!(seq_partial.current_block().parent_sequence_hash, None);
2413
2414        // Exactly one block
2415        let seq_one_block = create_test_sequence(&[1, 2, 3, 4], 4, Some(TEST_SALT_HASH));
2416        assert_eq!(seq_one_block.blocks().len(), 1);
2417        assert!(seq_one_block.current_block().is_empty());
2418        assert_eq!(seq_one_block.total_tokens(), 4);
2419        assert_eq!(seq_one_block.blocks[0].tokens().as_ref(), &[1, 2, 3, 4]);
2420        assert_eq!(seq_one_block.blocks[0].sequence_hash(), SEQ_HASH_1_4);
2421        assert_eq!(
2422            seq_one_block.current_block().parent_sequence_hash,
2423            Some(SEQ_HASH_1_4)
2424        );
2425
2426        // More than one block
2427        let seq_multi = create_test_sequence(&[1, 2, 3, 4, 5, 6, 7, 8, 9], 4, Some(TEST_SALT_HASH));
2428        assert_eq!(seq_multi.blocks().len(), 2);
2429        assert_eq!(seq_multi.current_block().tokens().as_ref(), &[9]);
2430        assert_eq!(seq_multi.total_tokens(), 9);
2431        assert_eq!(seq_multi.blocks[0].sequence_hash(), SEQ_HASH_1_4);
2432        assert_eq!(seq_multi.blocks[1].sequence_hash(), SEQ_HASH_5_8);
2433        assert_eq!(
2434            seq_multi.current_block().parent_sequence_hash,
2435            Some(SEQ_HASH_5_8)
2436        );
2437
2438        // Test tokens_at across blocks and partial block
2439        assert_eq!(seq_multi.tokens_at(0..4).as_ref(), &[1, 2, 3, 4]); // First complete block
2440        assert_eq!(seq_multi.tokens_at(4..8).as_ref(), &[5, 6, 7, 8]); // Second complete block
2441        assert_eq!(seq_multi.tokens_at(8..9).as_ref(), &[9]); // Current partial block
2442        assert_eq!(seq_multi.tokens_at(2..6).as_ref(), &[3, 4, 5, 6]); // Spanning blocks
2443        assert_eq!(seq_multi.tokens_at(6..9).as_ref(), &[7, 8, 9]); // Spanning to partial
2444        assert_eq!(seq_multi.tokens_at(5..5).as_ref(), &[0u32; 0]); // Empty range
2445        assert_eq!(seq_multi.tokens_at(10..15).as_ref(), &[0u32; 0]); // Out of bounds
2446
2447        // No salt hash
2448        let seq_no_salt = create_test_sequence(&[1, 2, 3, 4, 5], 4, None);
2449        assert_eq!(seq_no_salt.salt_hash(), 0);
2450        assert_eq!(seq_no_salt.blocks().len(), 1);
2451        assert_ne!(seq_no_salt.blocks[0].block_hash(), HASH_1_4); // Hash differs with salt 0
2452        assert_eq!(seq_no_salt.current_block().tokens().as_ref(), &[5]);
2453    }
2454
2455    #[test]
2456    #[should_panic]
2457    fn test_new_sequence_zero_block_size() {
2458        let _ = create_test_sequence(&[1], 0, None);
2459    }
2460
2461    #[test]
2462    fn test_append_single_token() {
2463        let mut sequence =
2464            create_test_sequence(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 4, Some(TEST_SALT_HASH));
2465        assert_eq!(sequence.blocks().len(), 2);
2466        assert_eq!(sequence.current_block().tokens.len(), 2);
2467        assert_eq!(sequence.current_block().tokens, vec![9, 10]);
2468        assert_eq!(
2469            sequence.current_block().parent_sequence_hash,
2470            Some(SEQ_HASH_5_8)
2471        );
2472
2473        // Append token 11 - should not complete a block
2474        let completed_idx = sequence.append(11).unwrap();
2475        assert_eq!(completed_idx, None);
2476        assert_eq!(sequence.blocks().len(), 2);
2477        assert_eq!(sequence.current_block().tokens.as_ref(), &[9, 10, 11]);
2478
2479        // Append token 12 - should complete block 2 (index 2)
2480        // This will also commit block 2
2481        let completed_idx = sequence.append(12).unwrap();
2482        assert_eq!(completed_idx, Some(2));
2483        assert_eq!(sequence.blocks().len(), 3);
2484        assert_eq!(sequence.current_block.tokens.as_ref(), &[0u32; 0]);
2485        assert_eq!(sequence.current_block.remaining(), 4);
2486        assert_eq!(
2487            sequence.current_block().parent_sequence_hash,
2488            Some(SEQ_HASH_9_12)
2489        ); // Still linked to block 1
2490
2491        // Append token 13 - should not complete a block
2492        let completed_idx_13 = sequence.append(13).unwrap();
2493        assert_eq!(completed_idx_13, None);
2494        assert_eq!(sequence.blocks().len(), 3);
2495        assert_eq!(sequence.blocks[2].tokens().as_ref(), &[9, 10, 11, 12]);
2496        assert_eq!(sequence.blocks[2].sequence_hash(), SEQ_HASH_9_12);
2497        assert_eq!(sequence.current_block.tokens.as_ref(), &[13]); // New current block has 13
2498        assert_eq!(sequence.current_block.remaining(), 3);
2499        assert_eq!(
2500            sequence.current_block.parent_sequence_hash,
2501            Some(SEQ_HASH_9_12)
2502        ); // Linked to new block 2
2503    }
2504
2505    #[test]
2506    fn test_extend() {
2507        let block_size = 4;
2508        let salt_hash = Some(TEST_SALT_HASH);
2509
2510        // Case 1: Extend less than block size
2511        let mut seq1 = create_test_sequence(&[], block_size, salt_hash);
2512        let tokens1 = Tokens::from(vec![1, 2]);
2513        let completed1 = seq1.extend(tokens1).unwrap();
2514        assert_eq!(completed1, None); // No blocks completed
2515        assert_eq!(seq1.blocks.len(), 0);
2516        assert_eq!(seq1.current_block.tokens.as_ref(), &[1, 2]);
2517        assert_eq!(seq1.current_block.remaining(), 2);
2518        assert_eq!(seq1.current_block.parent_sequence_hash, None); // Still the root block
2519
2520        // Case 2: Extend exactly block size
2521        let mut seq2 = create_test_sequence(&[], block_size, salt_hash);
2522        let tokens2 = Tokens::from(vec![1, 2, 3, 4]);
2523        let completed2 = seq2.extend(tokens2).unwrap();
2524        assert_eq!(completed2, Some(0..1));
2525        assert_eq!(seq2.blocks.len(), 1);
2526        assert_eq!(seq2.current_block.tokens.as_ref(), &[0u32; 0]); // Current block is empty
2527        assert_eq!(seq2.current_block.remaining(), 4);
2528        assert_eq!(seq2.current_block.parent_sequence_hash, Some(SEQ_HASH_1_4)); // Still the root block
2529
2530        // Case 3: Extend more than block size, less than two blocks
2531        let mut seq3 = create_test_sequence(&[], block_size, salt_hash);
2532        let tokens3 = Tokens::from(vec![1, 2, 3, 4, 5, 6]);
2533        let completed3 = seq3.extend(tokens3).unwrap();
2534        assert_eq!(completed3, Some(0..1)); // Block at index 0 completed
2535        assert_eq!(seq3.blocks.len(), 1);
2536        assert_eq!(seq3.current_block.tokens.as_ref(), &[5, 6]); // Partial block has remainder
2537        assert_eq!(seq3.blocks[0].tokens().as_ref(), &[1, 2, 3, 4]);
2538        assert_eq!(seq3.current_block.parent_sequence_hash, Some(SEQ_HASH_1_4));
2539        assert_eq!(seq3.current_block.remaining(), 2);
2540
2541        // Case 4: Extend exactly two blocks
2542        let mut seq4 = create_test_sequence(&[], block_size, salt_hash);
2543        let tokens4 = Tokens::from(vec![1, 2, 3, 4, 5, 6, 7, 8]);
2544        let completed4 = seq4.extend(tokens4).unwrap();
2545        assert_eq!(completed4, Some(0..2)); // Only block 0 is committed
2546        assert_eq!(seq4.blocks.len(), 2); // Only 1 block committed
2547        assert_eq!(seq4.current_block.tokens.as_ref(), &[0u32; 0]);
2548        assert_eq!(seq4.current_block.remaining(), 4);
2549        assert_eq!(seq4.blocks[0].tokens().as_ref(), &[1, 2, 3, 4]);
2550        assert_eq!(seq4.blocks[0].sequence_hash(), SEQ_HASH_1_4);
2551        assert_eq!(seq4.current_block.parent_sequence_hash, Some(SEQ_HASH_5_8)); // Parent is the first block
2552
2553        // Case 5: Extend multiple times, completing blocks across calls
2554        let mut seq5 = create_test_sequence(&[], block_size, salt_hash);
2555        let tokens5a = Tokens::from(vec![1, 2]);
2556        let completed5a = seq5.extend(tokens5a).unwrap();
2557        assert_eq!(completed5a, None);
2558        assert_eq!(seq5.blocks.len(), 0);
2559        assert_eq!(seq5.current_block.tokens.as_ref(), &[1, 2]);
2560
2561        let tokens5b = Tokens::from(vec![3, 4, 5]);
2562        let completed5b = seq5.extend(tokens5b).unwrap();
2563        assert_eq!(completed5b, Some(0..1)); // Block at index 0 completed
2564        assert_eq!(seq5.blocks.len(), 1);
2565        assert_eq!(seq5.current_block.tokens.as_ref(), &[5]);
2566        assert_eq!(seq5.blocks[0].tokens().as_ref(), &[1, 2, 3, 4]);
2567        assert_eq!(seq5.current_block.parent_sequence_hash, Some(SEQ_HASH_1_4));
2568        assert_eq!(seq5.current_block.remaining(), 3);
2569
2570        let tokens5c = Tokens::from(vec![6, 7, 8, 9, 10]);
2571        let completed5c = seq5.extend(tokens5c).unwrap();
2572        assert_eq!(completed5c, Some(1..2)); // Block at index 1 completed
2573        assert_eq!(seq5.blocks.len(), 2);
2574        assert_eq!(seq5.current_block.tokens.as_ref(), &[9, 10]);
2575        assert_eq!(seq5.blocks[1].tokens().as_ref(), &[5, 6, 7, 8]);
2576        assert_eq!(seq5.current_block.parent_sequence_hash, Some(SEQ_HASH_5_8));
2577        assert_eq!(seq5.current_block.remaining(), 2);
2578
2579        // Case 6: Extend empty tokens
2580        let mut seq6 = create_test_sequence(&[1], block_size, salt_hash);
2581        let completed6 = seq6.extend(Tokens::default()).unwrap();
2582        assert_eq!(completed6, None);
2583        assert_eq!(seq6.blocks.len(), 0);
2584        assert_eq!(seq6.current_block.tokens.as_ref(), &[1]);
2585        assert_eq!(seq6.total_tokens(), 1);
2586
2587        // Case 7: Extend fills current exactly, no remainder
2588        let mut seq7 = create_test_sequence(&[1, 2], block_size, salt_hash);
2589        let tokens7 = Tokens::from(vec![3, 4]);
2590        let completed7 = seq7.extend(tokens7).unwrap();
2591        assert_eq!(completed7, Some(0..1)); // Block is full but not committed yet
2592        assert_eq!(seq7.blocks.len(), 1);
2593        assert_eq!(seq7.current_block.tokens.as_ref(), &[0u32; 0]); // Current block is full
2594        assert_eq!(seq7.current_block.remaining(), 4);
2595        assert_eq!(seq7.total_tokens(), 4);
2596        assert_eq!(seq7.current_block.parent_sequence_hash, Some(SEQ_HASH_1_4)); // Still the root block
2597
2598        // Test tokens_at extraction
2599        assert_eq!(seq7.tokens_at(0..2).as_ref(), &[1, 2]);
2600        assert_eq!(seq7.tokens_at(1..3).as_ref(), &[2, 3]);
2601        assert_eq!(seq7.tokens_at(0..4).as_ref(), &[1, 2, 3, 4]);
2602        assert_eq!(seq7.tokens_at(2..2).as_ref(), &[0u32; 0]); // Empty range
2603    }
2604
2605    #[test]
2606    fn test_truncate() {
2607        let block_size = 4;
2608        let salt_hash = Some(TEST_SALT_HASH);
2609        let initial_tokens = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; // 10 tokens
2610
2611        // Case 1: Truncate within current block (len 9)
2612        let mut seq1 = create_test_sequence(initial_tokens, block_size, salt_hash);
2613        assert!(seq1.truncate(9).is_ok());
2614        assert_eq!(seq1.total_tokens(), 9);
2615        assert_eq!(seq1.blocks().len(), 2);
2616        assert_eq!(seq1.current_block().tokens.as_ref(), &[9]);
2617        assert_eq!(
2618            seq1.current_block().parent_sequence_hash,
2619            Some(SEQ_HASH_5_8)
2620        );
2621
2622        // Case 2: Truncate to exact block boundary (len 8)
2623        let mut seq2 = create_test_sequence(initial_tokens, block_size, salt_hash);
2624        assert!(seq2.truncate(8).is_ok());
2625        assert_eq!(seq2.total_tokens(), 8);
2626        assert_eq!(seq2.blocks().len(), 2);
2627        assert!(seq2.current_block().tokens.is_empty());
2628        assert_eq!(
2629            seq2.current_block().parent_sequence_hash,
2630            Some(SEQ_HASH_5_8)
2631        );
2632
2633        // Case 3: Truncate into last full block (len 7)
2634        let mut seq3 = create_test_sequence(initial_tokens, block_size, salt_hash);
2635        assert!(seq3.truncate(7).is_ok());
2636        assert_eq!(seq3.total_tokens(), 7);
2637        assert_eq!(seq3.blocks().len(), 1); // Block [5,6,7,8] removed conceptually
2638        assert_eq!(seq3.current_block().tokens.as_ref(), &[5, 6, 7]); // Kept 3 from [5,6,7,8]
2639        assert_eq!(
2640            seq3.current_block().parent_sequence_hash,
2641            Some(SEQ_HASH_1_4)
2642        ); // Parent is hash of [1,2,3,4]
2643        assert_eq!(seq3.blocks()[0].tokens().as_ref(), &[1, 2, 3, 4]);
2644
2645        // Case 4: Truncate removing full block(s) exactly (len 4)
2646        let mut seq4 = create_test_sequence(initial_tokens, block_size, salt_hash);
2647        assert!(seq4.truncate(4).is_ok());
2648        assert_eq!(seq4.total_tokens(), 4);
2649        assert_eq!(seq4.blocks().len(), 1); // Block [5,6,7,8] removed
2650        assert!(seq4.current_block().tokens.is_empty()); // New partial based on block [1,2,3,4]
2651        assert_eq!(
2652            seq4.current_block().parent_sequence_hash,
2653            Some(SEQ_HASH_1_4)
2654        );
2655        assert_eq!(seq4.blocks()[0].tokens().as_ref(), &[1, 2, 3, 4]);
2656
2657        // Case 5: Truncate into first block (len 3)
2658        let mut seq5 = create_test_sequence(initial_tokens, block_size, salt_hash);
2659        assert!(seq5.truncate(3).is_ok());
2660        assert_eq!(seq5.total_tokens(), 3);
2661        assert!(seq5.blocks().is_empty()); // Both blocks removed conceptually
2662        assert_eq!(seq5.current_block().tokens.as_ref(), &[1, 2, 3]); // Kept 3 from [1,2,3,4]
2663        assert_eq!(seq5.current_block().parent_sequence_hash, None); // No parent
2664
2665        // Case 6: Truncate to zero length (len 0)
2666        let mut seq6 = create_test_sequence(initial_tokens, block_size, salt_hash);
2667        assert!(seq6.truncate(0).is_ok());
2668        assert_eq!(seq6.total_tokens(), 0);
2669        assert!(seq6.blocks().is_empty());
2670        assert!(seq6.current_block().tokens.is_empty());
2671        assert_eq!(seq6.current_block().parent_sequence_hash, None);
2672
2673        // Case 7: Truncate to length greater than current (len 11)
2674        let mut seq7 = create_test_sequence(initial_tokens, block_size, salt_hash);
2675        let original_state = (seq7.blocks.clone(), seq7.current_block.tokens.clone()); // Clone for state check
2676        assert!(seq7.truncate(11).is_ok()); // Should have no effect
2677        assert_eq!(seq7.total_tokens(), 10);
2678        assert_eq!(seq7.blocks, original_state.0);
2679        assert_eq!(seq7.current_block.tokens, original_state.1);
2680
2681        // Case 8: Truncate to current length (len 10)
2682        let mut seq8 = create_test_sequence(initial_tokens, block_size, salt_hash);
2683        let original_state = (seq8.blocks.clone(), seq8.current_block.tokens.clone());
2684        assert!(seq8.truncate(10).is_ok());
2685        assert_eq!(seq8.total_tokens(), 10);
2686        assert_eq!(seq8.blocks, original_state.0);
2687        assert_eq!(seq8.current_block.tokens, original_state.1);
2688
2689        // Case 9: Truncate an empty sequence to 0
2690        let mut seq9 = create_test_sequence(&[], block_size, salt_hash);
2691        assert!(seq9.truncate(0).is_ok());
2692        assert_eq!(seq9.total_tokens(), 0);
2693        assert!(seq9.blocks().is_empty());
2694        assert!(seq9.current_block().tokens.is_empty());
2695
2696        // Case 10: Truncate on exact block boundary when current is empty (len 4)
2697        let tokens10 = &[1, 2, 3, 4, 5, 6, 7, 8]; // 8 tokens
2698        let mut seq10 = create_test_sequence(tokens10, block_size, salt_hash);
2699        assert_eq!(seq10.total_tokens(), 8);
2700        assert!(seq10.current_block().is_empty());
2701        assert!(seq10.truncate(4).is_ok()); // Remove block [5, 6, 7, 8]
2702        assert_eq!(seq10.total_tokens(), 4);
2703        assert_eq!(seq10.blocks().len(), 1);
2704        assert!(seq10.current_block().tokens.is_empty());
2705        assert_eq!(
2706            seq10.current_block().parent_sequence_hash,
2707            Some(SEQ_HASH_1_4)
2708        );
2709
2710        // Case 11: Truncate into first block when current is empty (len 3)
2711        let tokens11 = &[1, 2, 3, 4, 5, 6, 7, 8]; // 8 tokens
2712        let mut seq11 = create_test_sequence(tokens11, block_size, salt_hash);
2713        assert!(seq11.truncate(3).is_ok()); // Pop block [5,6,7,8] + 1 from [1,2,3,4]
2714        assert_eq!(seq11.total_tokens(), 3);
2715        assert!(seq11.blocks().is_empty());
2716        assert_eq!(seq11.current_block().tokens.as_ref(), &[1, 2, 3]); // Kept 3 from [1,2,3,4]
2717        assert_eq!(seq11.current_block().parent_sequence_hash, None);
2718    }
2719
2720    #[test]
2721    fn test_unwind() {
2722        let block_size = 4;
2723        let salt_hash = Some(TEST_SALT_HASH);
2724        let initial_tokens = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; // 10 tokens
2725
2726        // Unwind 0
2727        let mut seq = create_test_sequence(initial_tokens, block_size, salt_hash);
2728        assert!(seq.unwind(0).is_ok());
2729        assert_eq!(seq.total_tokens(), 10);
2730
2731        // Unwind 1
2732        let mut seq = create_test_sequence(initial_tokens, block_size, salt_hash);
2733        assert!(seq.unwind(1).is_ok());
2734        assert_eq!(seq.total_tokens(), 9);
2735        assert_eq!(seq.current_block.tokens.as_ref(), &[9]);
2736
2737        // Unwind 3 (crosses boundary)
2738        let mut seq = create_test_sequence(initial_tokens, block_size, salt_hash);
2739        assert!(seq.unwind(3).is_ok());
2740        assert_eq!(seq.total_tokens(), 7);
2741        assert_eq!(seq.blocks.len(), 1);
2742        assert_eq!(seq.current_block.tokens.as_ref(), &[5, 6, 7]);
2743
2744        // Unwind all (10)
2745        let mut seq = create_test_sequence(initial_tokens, block_size, salt_hash);
2746        assert!(seq.unwind(10).is_ok());
2747        assert_eq!(seq.total_tokens(), 0);
2748        assert!(seq.blocks.is_empty());
2749        assert!(seq.current_block.is_empty());
2750
2751        // Unwind more than available (11)
2752        let mut seq = create_test_sequence(initial_tokens, block_size, salt_hash);
2753        assert_eq!(seq.unwind(11), Err(TokenBlockError::InsufficientTokens));
2754        assert_eq!(seq.total_tokens(), 10); // State unchanged
2755
2756        // Unwind from empty
2757        let mut seq_empty = create_test_sequence(&[], block_size, salt_hash);
2758        assert_eq!(
2759            seq_empty.unwind(1),
2760            Err(TokenBlockError::InsufficientTokens)
2761        );
2762    }
2763
2764    #[test]
2765    fn test_pop() {
2766        let block_size = 4;
2767        let salt_hash = Some(TEST_SALT_HASH);
2768        let initial_tokens = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; // 10 tokens
2769
2770        let mut seq = create_test_sequence(initial_tokens, block_size, salt_hash);
2771
2772        // Pop 10
2773        assert_eq!(seq.pop(), Some(10));
2774        assert_eq!(seq.total_tokens(), 9);
2775        assert_eq!(seq.current_block.tokens.as_ref(), &[9]);
2776        assert_eq!(seq.blocks.len(), 2);
2777
2778        // Pop 9
2779        assert_eq!(seq.pop(), Some(9));
2780        assert_eq!(seq.total_tokens(), 8);
2781        assert!(seq.current_block.is_empty());
2782        assert_eq!(seq.blocks.len(), 2);
2783        assert_eq!(seq.current_block.parent_sequence_hash, Some(SEQ_HASH_5_8));
2784
2785        // Pop 8 (crosses boundary)
2786        assert_eq!(seq.pop(), Some(8));
2787        assert_eq!(seq.total_tokens(), 7);
2788        assert_eq!(seq.current_block.tokens.as_ref(), &[5, 6, 7]);
2789        assert_eq!(seq.blocks.len(), 1);
2790        assert_eq!(seq.current_block.parent_sequence_hash, Some(SEQ_HASH_1_4));
2791
2792        // Pop remaining partial (7, 6, 5)
2793        assert_eq!(seq.pop(), Some(7));
2794        assert_eq!(seq.pop(), Some(6));
2795        assert_eq!(seq.pop(), Some(5));
2796        assert_eq!(seq.total_tokens(), 4);
2797        assert!(seq.current_block.is_empty());
2798        assert_eq!(seq.blocks.len(), 1);
2799        assert_eq!(seq.current_block.parent_sequence_hash, Some(SEQ_HASH_1_4));
2800
2801        // Pop 4 (crosses boundary)
2802        assert_eq!(seq.pop(), Some(4));
2803        assert_eq!(seq.total_tokens(), 3);
2804        assert_eq!(seq.current_block.tokens.as_ref(), &[1, 2, 3]);
2805        assert!(seq.blocks.is_empty());
2806        assert_eq!(seq.current_block.parent_sequence_hash, None);
2807
2808        // Pop 3, 2, 1
2809        assert_eq!(seq.pop(), Some(3));
2810        assert_eq!(seq.pop(), Some(2));
2811        assert_eq!(seq.pop(), Some(1));
2812        assert_eq!(seq.total_tokens(), 0);
2813        assert!(seq.current_block.is_empty());
2814        assert!(seq.blocks.is_empty());
2815
2816        // Pop from empty
2817        assert_eq!(seq.pop(), None);
2818        assert_eq!(seq.total_tokens(), 0);
2819    }
2820
2821    #[test]
2822    fn test_total_tokens() {
2823        let block_size = 3;
2824        let salt_hash = Some(TEST_SALT_HASH);
2825
2826        let mut seq = create_test_sequence(&[], block_size, salt_hash);
2827        assert_eq!(seq.total_tokens(), 0);
2828
2829        seq.extend(Tokens::from(vec![1, 2])).unwrap();
2830        assert_eq!(seq.total_tokens(), 2);
2831
2832        seq.append(3).unwrap(); // Completes block 0
2833        assert_eq!(seq.total_tokens(), 3);
2834
2835        seq.extend(Tokens::from(vec![4, 5, 6, 7])).unwrap(); // Completes block 1, partial [7]
2836        assert_eq!(seq.total_tokens(), 7);
2837
2838        seq.pop().unwrap(); // Removes 7
2839        assert_eq!(seq.total_tokens(), 6);
2840
2841        seq.truncate(4).unwrap(); // Keep [1,2,3,4]
2842        assert_eq!(seq.total_tokens(), 4);
2843
2844        seq.unwind(2).unwrap(); // Keep [1,2]
2845        assert_eq!(seq.total_tokens(), 2);
2846    }
2847
2848    #[test]
2849    fn test_push_tokens_partial_block() {
2850        let mut partial = PartialTokenBlock::create_sequence_root(4, 1337);
2851
2852        let tokens = Tokens(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
2853
2854        let remaining = partial.push_tokens(tokens);
2855        assert_eq!(partial.tokens.len(), 4);
2856        assert_eq!(remaining.len(), 6);
2857    }
2858
2859    // ========== Additional tests for coverage improvement ==========
2860
2861    // === PositionalRadixTree Tests ===
2862
2863    #[test]
2864    fn test_positional_radix_tree_basic_operations() {
2865        use crate::PositionalRadixTree;
2866
2867        // Test new() and is_empty()
2868        let tree: PositionalRadixTree<String> = PositionalRadixTree::new();
2869        assert!(tree.is_empty());
2870        assert_eq!(tree.len(), 0);
2871
2872        // Test default()
2873        let tree2: PositionalRadixTree<i32> = PositionalRadixTree::default();
2874        assert!(tree2.is_empty());
2875
2876        // Test prefix() and insertion
2877        let psh1 = PositionalSequenceHash::new(0x1234, 0, 0xABCD);
2878        let psh2 = PositionalSequenceHash::new(0x5678, 0, 0xEF01);
2879        let psh3 = PositionalSequenceHash::new(0x9ABC, 1, 0x2345);
2880
2881        tree.prefix(&psh1).insert(psh1, "value1".to_string());
2882        assert!(!tree.is_empty());
2883        assert_eq!(tree.len(), 1);
2884
2885        tree.prefix(&psh2).insert(psh2, "value2".to_string());
2886        assert_eq!(tree.len(), 2);
2887
2888        tree.prefix(&psh3).insert(psh3, "value3".to_string());
2889        assert_eq!(tree.len(), 3);
2890
2891        // Test retrieval
2892        assert_eq!(
2893            tree.prefix(&psh1).get(&psh1).map(|v| v.clone()),
2894            Some("value1".to_string())
2895        );
2896    }
2897
2898    #[test]
2899    fn test_positional_radix_tree_with_lineage_hash() {
2900        use crate::PositionalRadixTree;
2901
2902        // Test generic usage with PositionalLineageHash
2903        let tree: PositionalRadixTree<u32, PositionalLineageHash> = PositionalRadixTree::new();
2904        assert!(tree.is_empty());
2905
2906        let plh1 = PositionalLineageHash::new(0x1234, None, 0);
2907        let plh2 = PositionalLineageHash::new(0x5678, Some(0x1234), 1);
2908
2909        tree.prefix(&plh1).insert(plh1, 100);
2910        tree.prefix(&plh2).insert(plh2, 200);
2911
2912        assert_eq!(tree.len(), 2);
2913        assert_eq!(tree.prefix(&plh1).get(&plh1).map(|v| *v), Some(100));
2914        assert_eq!(tree.prefix(&plh2).get(&plh2).map(|v| *v), Some(200));
2915    }
2916
2917    #[test]
2918    fn test_positional_radix_tree_position_lookup() {
2919        use crate::PositionalRadixTree;
2920
2921        let tree: PositionalRadixTree<String> = PositionalRadixTree::new();
2922
2923        // Insert at different positions
2924        let psh0 = PositionalSequenceHash::new(0x1111, 0, 0xAAAA);
2925        let psh1 = PositionalSequenceHash::new(0x2222, 1, 0xBBBB);
2926        let psh2 = PositionalSequenceHash::new(0x3333, 2, 0xCCCC);
2927
2928        tree.prefix(&psh0).insert(psh0, "pos0".to_string());
2929        tree.prefix(&psh1).insert(psh1, "pos1".to_string());
2930        tree.prefix(&psh2).insert(psh2, "pos2".to_string());
2931
2932        // Test position() method
2933        assert!(tree.position(0).is_some());
2934        assert!(tree.position(1).is_some());
2935        assert!(tree.position(2).is_some());
2936        assert!(tree.position(3).is_none()); // No entries at position 3
2937
2938        // Verify position lookup returns correct submap
2939        let pos0_map = tree.position(0).unwrap();
2940        assert_eq!(pos0_map.len(), 1);
2941    }
2942
2943    // === PositionalSequenceHash Additional Tests ===
2944
2945    #[test]
2946    fn test_positional_sequence_hash_mode_2_and_3() {
2947        // Mode 2: position fits in 24 bits (65536 <= pos < 16777216)
2948        let position_mode2 = 100_000u64;
2949        let seq_hash = 0x1234567890ABCDEF;
2950        let block_hash = 0xFEDCBA9876543210;
2951
2952        let psh_mode2 = PositionalSequenceHash::new(seq_hash, position_mode2, block_hash);
2953        assert_eq!(psh_mode2.mode(), 2, "Position 100,000 should use mode 2");
2954        assert_eq!(psh_mode2.position(), position_mode2);
2955        assert_eq!(psh_mode2.sequence_hash(), seq_hash);
2956        // Local block hash truncated to 38 bits in mode 2
2957        assert_eq!(
2958            psh_mode2.local_block_hash(),
2959            block_hash & ((1u64 << 38) - 1)
2960        );
2961
2962        // Mode 3: position fits in 31 bits (16777216 <= pos < 2147483648)
2963        let position_mode3 = 100_000_000u64;
2964        let psh_mode3 = PositionalSequenceHash::new(seq_hash, position_mode3, block_hash);
2965        assert_eq!(
2966            psh_mode3.mode(),
2967            3,
2968            "Position 100,000,000 should use mode 3"
2969        );
2970        assert_eq!(psh_mode3.position(), position_mode3);
2971        assert_eq!(psh_mode3.sequence_hash(), seq_hash);
2972        // Local block hash truncated to 31 bits in mode 3
2973        assert_eq!(
2974            psh_mode3.local_block_hash(),
2975            block_hash & ((1u64 << 31) - 1)
2976        );
2977    }
2978
2979    #[test]
2980    fn test_positional_sequence_hash_as_u128() {
2981        let psh = PositionalSequenceHash::new(0x1234, 100, 0xABCD);
2982        let raw = psh.as_u128();
2983
2984        // Verify we can reconstruct from raw value
2985        assert_eq!(raw & 0xFFFF_FFFF_FFFF_FFFF, 0x1234);
2986        assert!(raw > 0); // Non-zero
2987
2988        // Create another and compare
2989        let psh2 = PositionalSequenceHash::new(0x1234, 100, 0xABCD);
2990        assert_eq!(psh.as_u128(), psh2.as_u128());
2991    }
2992
2993    #[test]
2994    fn test_positional_sequence_hash_debug() {
2995        let psh = PositionalSequenceHash::new(0x1234567890ABCDEF, 42, 0xFEDCBA98);
2996        let debug_str = format!("{:?}", psh);
2997
2998        // Debug should contain field names and values
2999        assert!(debug_str.contains("PositionalSequenceHash"));
3000        assert!(debug_str.contains("sequence_hash"));
3001        assert!(debug_str.contains("local_block_hash"));
3002        assert!(debug_str.contains("position"));
3003    }
3004
3005    // === PositionalLineageHash Additional Tests ===
3006
3007    #[test]
3008    fn test_positional_lineage_hash_debug_and_display() {
3009        // Test position 0 (no parent shown)
3010        let plh_root = PositionalLineageHash::new(0x123456789ABCDEF0, None, 0);
3011        let debug_root = format!("{:?}", plh_root);
3012        let display_root = format!("{}", plh_root);
3013
3014        // Debug and Display should show position 0
3015        assert!(debug_root.starts_with("0:"));
3016        assert!(display_root.starts_with("0:"));
3017        // Position 0 should not show parent
3018        assert_eq!(debug_root.matches(':').count(), 1);
3019        assert_eq!(display_root.matches(':').count(), 1);
3020
3021        // Test position > 0 (parent shown)
3022        let plh_child = PositionalLineageHash::new(0xABCDEF0123456789, Some(0x123456789ABCDEF0), 5);
3023        let debug_child = format!("{:?}", plh_child);
3024        let display_child = format!("{}", plh_child);
3025
3026        // Should show position:current:parent
3027        assert!(debug_child.starts_with("5:"));
3028        assert!(display_child.starts_with("5:"));
3029        // Position > 0 should show parent (3 parts)
3030        assert_eq!(debug_child.matches(':').count(), 2);
3031        assert_eq!(display_child.matches(':').count(), 2);
3032    }
3033
3034    #[test]
3035    fn test_positional_lineage_hash_as_u128() {
3036        let plh = PositionalLineageHash::new(0x1234, Some(0x5678), 10);
3037        let raw = plh.as_u128();
3038
3039        assert!(raw > 0);
3040
3041        // Create another with same params and compare
3042        let plh2 = PositionalLineageHash::new(0x1234, Some(0x5678), 10);
3043        assert_eq!(plh.as_u128(), plh2.as_u128());
3044
3045        // Different params should give different hash
3046        let plh3 = PositionalLineageHash::new(0x1234, Some(0x5678), 11);
3047        assert_ne!(plh.as_u128(), plh3.as_u128());
3048    }
3049
3050    #[test]
3051    fn test_positional_lineage_hash_ord_by_position_then_current_fragment() {
3052        let at_5_low = PositionalLineageHash::new(0x10, Some(0x1111), 5);
3053        let at_5_high = PositionalLineageHash::new(0x20, Some(0x1111), 5);
3054        assert!(
3055            at_5_low.current_sequence_hash() < at_5_high.current_sequence_hash(),
3056            "test assumes distinct current sequence hashes at the same position"
3057        );
3058        assert!(at_5_low < at_5_high);
3059        assert!(at_5_high > at_5_low);
3060
3061        let at_3 = PositionalLineageHash::new(0x99, Some(0x2222), 3);
3062        assert!(at_3 < at_5_low);
3063        assert!(at_5_high < PositionalLineageHash::new(0x01, Some(0x3333), 6));
3064    }
3065
3066    #[test]
3067    fn test_positional_lineage_hash_ord_tiebreak_parent_via_packed_u128() {
3068        let same_pos_same_current = PositionalLineageHash::new(0x1234, Some(0x100), 10);
3069        let same_pos_same_current_other_parent =
3070            PositionalLineageHash::new(0x1234, Some(0x200), 10);
3071        assert_eq!(same_pos_same_current.position(), 10);
3072        assert_eq!(
3073            same_pos_same_current.position(),
3074            same_pos_same_current_other_parent.position()
3075        );
3076        assert_eq!(
3077            same_pos_same_current.current_sequence_hash(),
3078            same_pos_same_current_other_parent.current_sequence_hash()
3079        );
3080        assert_ne!(same_pos_same_current, same_pos_same_current_other_parent);
3081        assert_ne!(
3082            same_pos_same_current.cmp(&same_pos_same_current_other_parent),
3083            std::cmp::Ordering::Equal
3084        );
3085    }
3086
3087    #[test]
3088    fn test_positional_lineage_hash_vec_sort_matches_ord() {
3089        let a = PositionalLineageHash::new(0x30, None, 0);
3090        let b = PositionalLineageHash::new(0x10, Some(0x30), 2);
3091        let c = PositionalLineageHash::new(0x20, Some(0x30), 2);
3092        let mut v = vec![b, a, c];
3093        v.sort();
3094        assert_eq!(v, vec![a, b, c]);
3095    }
3096
3097    #[test]
3098    fn test_positional_lineage_hash_itertools_sorted() {
3099        use itertools::Itertools;
3100
3101        let a = PositionalLineageHash::new(0x30, None, 0);
3102        let b = PositionalLineageHash::new(0x10, Some(0x30), 2);
3103        let c = PositionalLineageHash::new(0x20, Some(0x30), 2);
3104        let sorted: Vec<_> = vec![b, a, c].into_iter().sorted().collect();
3105        assert_eq!(sorted, vec![a, b, c]);
3106    }
3107
3108    // === Tokens From Impls ===
3109
3110    #[test]
3111    fn test_tokens_from_vec_usize() {
3112        let usize_vec: Vec<usize> = vec![1, 2, 3, 4, 5];
3113        let tokens = Tokens::from(usize_vec);
3114
3115        assert_eq!(tokens.as_ref(), &[1u32, 2, 3, 4, 5]);
3116        assert_eq!(tokens.len(), 5);
3117    }
3118
3119    #[test]
3120    fn test_tokens_partial_eq_slice_ref() {
3121        let tokens = Tokens::from(vec![1u32, 2, 3, 4]);
3122        let slice: &[Token] = &[1, 2, 3, 4];
3123
3124        // Test PartialEq<&[Token]> for Tokens
3125        assert!(tokens == slice);
3126
3127        let different_slice: &[Token] = &[1, 2, 3, 5];
3128        assert!(tokens != different_slice);
3129    }
3130
3131    // === TokenBlock Accessors ===
3132
3133    #[test]
3134    fn test_token_block_accessors() {
3135        let tokens = Tokens::from(vec![1u32, 2, 3, 4]);
3136        let seq = TokenBlockSequence::new(tokens, 4, Some(1337));
3137
3138        let block = &seq.blocks()[0];
3139
3140        // Test block_size()
3141        assert_eq!(block.block_size(), 4);
3142
3143        // Test positional_sequence_hash()
3144        let psh = block.positional_sequence_hash();
3145        assert_eq!(psh.position(), 0);
3146
3147        // Test positional_lineage_hash()
3148        let plh = block.positional_lineage_hash();
3149        assert_eq!(plh.position(), 0);
3150        assert_eq!(plh.parent_hash_fragment(), 0); // Root has no parent
3151    }
3152
3153    #[test]
3154    fn test_positional_hash_trait_impls() {
3155        use crate::PositionalHash;
3156
3157        // Test PositionalHash for PositionalSequenceHash
3158        let psh = PositionalSequenceHash::new(0x1234, 42, 0xABCD);
3159        assert_eq!(PositionalHash::position(&psh), 42);
3160
3161        // Test PositionalHash for PositionalLineageHash
3162        let plh = PositionalLineageHash::new(0x1234, None, 99);
3163        assert_eq!(PositionalHash::position(&plh), 99);
3164    }
3165
3166    // === TokenBlockSequence Edge Cases ===
3167
3168    #[test]
3169    fn test_sequence_pop_from_full_block() {
3170        // Test pop when current partial block is empty (must pop from full block)
3171        let tokens = Tokens::from(vec![1u32, 2, 3, 4, 5, 6, 7, 8]);
3172        let mut seq = TokenBlockSequence::new(tokens, 4, Some(TEST_SALT_HASH));
3173
3174        // Current block should be empty, all tokens in completed blocks
3175        assert!(seq.current_block().is_empty());
3176        assert_eq!(seq.blocks().len(), 2);
3177        assert_eq!(seq.total_tokens(), 8);
3178
3179        // Pop should remove from last full block
3180        let popped = seq.pop();
3181        assert_eq!(popped, Some(8));
3182        assert_eq!(seq.total_tokens(), 7);
3183        assert_eq!(seq.blocks().len(), 1);
3184        assert_eq!(seq.current_block().tokens.as_ref(), &[5, 6, 7]);
3185    }
3186
3187    #[test]
3188    #[allow(clippy::reversed_empty_ranges)] // so we can explicitly test invalid ranges
3189    fn test_sequence_tokens_at_edge_cases() {
3190        let tokens = Tokens::from(vec![1u32, 2, 3, 4, 5]);
3191        let seq = TokenBlockSequence::new(tokens, 4, Some(TEST_SALT_HASH));
3192
3193        // Start > end (invalid range
3194        assert!(seq.tokens_at(3..2).is_empty());
3195
3196        // End > total (out of bounds)
3197        assert!(seq.tokens_at(0..10).is_empty());
3198
3199        // Valid edge case: exact boundaries
3200        assert_eq!(seq.tokens_at(0..4).as_ref(), &[1, 2, 3, 4]);
3201        assert_eq!(seq.tokens_at(4..5).as_ref(), &[5]);
3202    }
3203
3204    #[test]
3205    fn test_sequence_next_block() {
3206        let tokens = Tokens::from(vec![1u32, 2, 3, 4]);
3207        let seq = TokenBlockSequence::new(tokens, 4, Some(1337));
3208
3209        let block = &seq.blocks()[0];
3210        let next_partial = block.next_block();
3211
3212        // next_block should create a partial block linked to this block
3213        assert!(next_partial.is_empty());
3214        assert_eq!(next_partial.remaining(), 4);
3215        assert_eq!(
3216            next_partial.parent_sequence_hash,
3217            Some(block.sequence_hash())
3218        );
3219        assert_eq!(next_partial.position, 1);
3220    }
3221
3222    #[test]
3223    fn test_sequence_reset() {
3224        let tokens = Tokens::from(vec![1u32, 2, 3, 4, 5, 6, 7, 8, 9]);
3225        let mut seq = TokenBlockSequence::new(tokens, 4, Some(1337));
3226
3227        assert_eq!(seq.blocks().len(), 2);
3228        assert_eq!(seq.total_tokens(), 9);
3229
3230        seq.reset();
3231
3232        assert!(seq.blocks().is_empty());
3233        assert!(seq.current_block().is_empty());
3234        assert_eq!(seq.total_tokens(), 0);
3235        assert_eq!(seq.current_block().parent_sequence_hash, None);
3236    }
3237
3238    #[test]
3239    fn test_sequence_into_parts() {
3240        let tokens = Tokens::from(vec![1u32, 2, 3, 4, 5]);
3241        let seq = TokenBlockSequence::new(tokens, 4, Some(1337));
3242
3243        let (blocks, partial) = seq.into_parts();
3244
3245        assert_eq!(blocks.len(), 1);
3246        assert_eq!(partial.tokens.as_ref(), &[5]);
3247    }
3248
3249    #[test]
3250    fn test_sequence_last_complete_block() {
3251        // Empty sequence
3252        let seq_empty = TokenBlockSequence::new(Tokens::default(), 4, None);
3253        assert!(seq_empty.last_complete_block().is_none());
3254
3255        // With blocks
3256        let tokens = Tokens::from(vec![1u32, 2, 3, 4, 5, 6, 7, 8]);
3257        let seq = TokenBlockSequence::new(tokens, 4, Some(1337));
3258        let last = seq.last_complete_block();
3259        assert!(last.is_some());
3260        assert_eq!(last.unwrap().tokens().as_ref(), &[5, 6, 7, 8]);
3261    }
3262
3263    #[test]
3264    fn test_positional_hashes_msgpack_roundtrip() {
3265        let psh = PositionalSequenceHash::new(0xDEAD_BEEF_CAFE_BABE, 12345, 0x0123_4567_89AB_CDEF);
3266        let bytes = rmp_serde::to_vec(&psh).expect("psh serialize");
3267        let decoded: PositionalSequenceHash =
3268            rmp_serde::from_slice(&bytes).expect("psh deserialize");
3269        assert_eq!(psh, decoded);
3270        assert_eq!(psh.as_u128(), decoded.as_u128());
3271
3272        let plh =
3273            PositionalLineageHash::new(0x1111_2222_3333_4444, Some(0x5555_6666_7777_8888), 256);
3274        let bytes = rmp_serde::to_vec(&plh).expect("plh serialize");
3275        let decoded: PositionalLineageHash =
3276            rmp_serde::from_slice(&bytes).expect("plh deserialize");
3277        assert_eq!(plh, decoded);
3278        assert_eq!(plh.as_u128(), decoded.as_u128());
3279
3280        // Vec roundtrip — exercises the codec inside a container.
3281        let vec = vec![psh, PositionalSequenceHash::default(), psh];
3282        let bytes = rmp_serde::to_vec(&vec).expect("vec serialize");
3283        let decoded: Vec<PositionalSequenceHash> =
3284            rmp_serde::from_slice(&bytes).expect("vec deserialize");
3285        assert_eq!(vec, decoded);
3286    }
3287
3288    #[test]
3289    fn test_positional_hashes_json_roundtrip() {
3290        // Confirm the byte-array codec also roundtrips through JSON (array of u8).
3291        let psh = PositionalSequenceHash::new(0xAAAA_BBBB_CCCC_DDDD, 7, 0xEEEE_FFFF_0000_1111);
3292        let json = serde_json::to_string(&psh).expect("psh json serialize");
3293        let decoded: PositionalSequenceHash =
3294            serde_json::from_str(&json).expect("psh json deserialize");
3295        assert_eq!(psh, decoded);
3296
3297        let plh = PositionalLineageHash::new(0x1234_5678, Some(0xABCD_EF01), 42);
3298        let json = serde_json::to_string(&plh).expect("plh json serialize");
3299        let decoded: PositionalLineageHash =
3300            serde_json::from_str(&json).expect("plh json deserialize");
3301        assert_eq!(plh, decoded);
3302    }
3303
3304    // ----------------------------------------------------------------------------------------
3305    // Multimodal block-formation tests (#10–14 in the kv-hashing plan).
3306    // ----------------------------------------------------------------------------------------
3307
3308    /// #10: a sequence built via `new_with_mm` with empty mm_info must equal one built via `new`.
3309    #[test]
3310    fn tokens_mm_zero_mm_equivalence() {
3311        let tokens = Tokens::from(vec![1u32, 2, 3, 4, 5, 6, 7, 8, 9]);
3312        let baseline = TokenBlockSequence::new(tokens.clone(), 4, Some(TEST_SALT_HASH));
3313        let mm = TokenBlockSequence::new_with_mm(tokens, &[], 4, Some(TEST_SALT_HASH))
3314            .expect("validation should pass for empty mm_info");
3315
3316        assert_eq!(mm.blocks().len(), baseline.blocks().len());
3317        for (a, b) in mm.blocks().iter().zip(baseline.blocks().iter()) {
3318            assert_eq!(a.salt_hash(), b.salt_hash());
3319            assert_eq!(a.block_hash(), b.block_hash());
3320            assert_eq!(a.sequence_hash(), b.sequence_hash());
3321            assert_eq!(a.parent_sequence_hash(), b.parent_sequence_hash());
3322            assert_eq!(a.positional_lineage_hash(), b.positional_lineage_hash());
3323        }
3324        assert!(mm.mm_runs().is_empty());
3325    }
3326
3327    /// #11: byte layout — verify the MM-aware buffer matches the documented
3328    /// 13-bytes-per-slot tagged framing, and that block_hash is XXH3 over that exact buffer.
3329    #[test]
3330    fn tokens_mm_byte_layout() {
3331        // Block 0: tokens [t0..t3], placeholder run [4..6) with mm_hash=0xAA, then t6, t7.
3332        // block_size = 8 ⇒ block_offset = 0, run covers slots 4..6. The block is MM-affected
3333        // ⇒ tagged 13-byte frames apply to *every* slot.
3334        let tokens = Tokens::from(vec![100u32, 101, 102, 103, 0, 0, 106, 107]);
3335        let mm = vec![TokenBlockMmInfo {
3336            mm_hash: 0xAAu64,
3337            offset: 4,
3338            length: 2,
3339        }];
3340        let salt = TEST_SALT_HASH;
3341
3342        // Build expected bytes manually: each slot is 13 bytes.
3343        let mut expected = Vec::new();
3344        for &t in &[100u32, 101, 102, 103] {
3345            expected.push(MM_SLOT_TAG_TOKEN);
3346            expected.extend_from_slice(&t.to_le_bytes());
3347            expected.extend_from_slice(&0u64.to_le_bytes());
3348        }
3349        for run_off in 0u32..2 {
3350            expected.push(MM_SLOT_TAG_PLACEHOLDER);
3351            expected.extend_from_slice(&run_off.to_le_bytes());
3352            expected.extend_from_slice(&0xAAu64.to_le_bytes());
3353        }
3354        for &t in &[106u32, 107] {
3355            expected.push(MM_SLOT_TAG_TOKEN);
3356            expected.extend_from_slice(&t.to_le_bytes());
3357            expected.extend_from_slice(&0u64.to_le_bytes());
3358        }
3359        assert_eq!(expected.len(), 8 * 13);
3360
3361        // Validate helper output matches.
3362        let helper_bytes = compute_block_bytes_with_mm(&tokens, 0, &mm);
3363        assert_eq!(helper_bytes, expected, "MM-aware byte buffer mismatch");
3364
3365        // Validate block_hash equals XXH3 over the expected buffer.
3366        let expected_block_hash = compute_block_hash(&expected, salt);
3367        let seq = TokenBlockSequence::new_with_mm(tokens, &mm, 8, Some(salt)).unwrap();
3368        assert_eq!(seq.blocks().len(), 1);
3369        assert_eq!(seq.blocks()[0].block_hash(), expected_block_hash);
3370    }
3371
3372    /// #11b — collision regression. Reviewer P1: with the original 4/12 mixed encoding,
3373    /// `block_size=2` blocks `[MM(slot 0), token]` and `[token, MM(slot 1)]` could produce
3374    /// identical byte streams under chosen `mm_hash`/token values. With tagged 13-byte
3375    /// framing they MUST differ.
3376    #[test]
3377    fn tokens_mm_no_position_collision() {
3378        let salt = TEST_SALT_HASH;
3379        // Layout A: block_size=2, MM at slot 0, token at slot 1.
3380        let tokens_a = Tokens::from(vec![0u32, 0xAB]);
3381        let mm_a = vec![TokenBlockMmInfo {
3382            mm_hash: 0x1122_3344_5566_7788,
3383            offset: 0,
3384            length: 1,
3385        }];
3386        // Layout B: block_size=2, token at slot 0, MM at slot 1.
3387        let tokens_b = Tokens::from(vec![0xAB, 0u32]);
3388        let mm_b = vec![TokenBlockMmInfo {
3389            mm_hash: 0x1122_3344_5566_7788,
3390            offset: 1,
3391            length: 1,
3392        }];
3393
3394        let bytes_a = compute_block_bytes_with_mm(&tokens_a, 0, &mm_a);
3395        let bytes_b = compute_block_bytes_with_mm(&tokens_b, 0, &mm_b);
3396        assert_ne!(
3397            bytes_a, bytes_b,
3398            "tagged framing must distinguish slot kinds at different positions"
3399        );
3400
3401        let seq_a = TokenBlockSequence::new_with_mm(tokens_a, &mm_a, 2, Some(salt)).unwrap();
3402        let seq_b = TokenBlockSequence::new_with_mm(tokens_b, &mm_b, 2, Some(salt)).unwrap();
3403        assert_ne!(
3404            seq_a.blocks()[0].block_hash(),
3405            seq_b.blocks()[0].block_hash()
3406        );
3407    }
3408
3409    /// #11c — per-block legacy fallback. A block with no overlapping MM run uses the legacy
3410    /// 4-byte-per-slot encoding so its `block_hash` matches the existing zero-MM path. Block 0
3411    /// of an MM-bearing sequence (run starts in block 1) must equal block 0 of a no-MM sequence.
3412    #[test]
3413    fn tokens_mm_legacy_fallback_per_block() {
3414        let block_size: u32 = 4;
3415        let salt = Some(TEST_SALT_HASH);
3416        let raw = vec![1u32, 2, 3, 4, 5, 6, 7, 8];
3417        // MM run covers only block 1 (positions [4..7)).
3418        let mm = vec![TokenBlockMmInfo {
3419            mm_hash: 0xAB,
3420            offset: 4,
3421            length: 3,
3422        }];
3423        let seq_mm =
3424            TokenBlockSequence::new_with_mm(Tokens::from(raw.clone()), &mm, block_size, salt)
3425                .unwrap();
3426        let seq_plain = TokenBlockSequence::new(Tokens::from(raw), block_size, salt);
3427
3428        // Block 0 untouched by MM ⇒ identical hashes.
3429        assert_eq!(
3430            seq_mm.blocks()[0].block_hash(),
3431            seq_plain.blocks()[0].block_hash()
3432        );
3433        assert_eq!(
3434            seq_mm.blocks()[0].sequence_hash(),
3435            seq_plain.blocks()[0].sequence_hash()
3436        );
3437        // Block 1 IS MM-affected ⇒ hashes diverge.
3438        assert_ne!(
3439            seq_mm.blocks()[1].block_hash(),
3440            seq_plain.blocks()[1].block_hash()
3441        );
3442    }
3443
3444    /// #11d — `offset + length` overflow is rejected as a dedicated error variant rather
3445    /// than panicking or silently wrapping.
3446    #[test]
3447    fn tokens_mm_validation_overflow() {
3448        let bad = vec![TokenBlockMmInfo {
3449            mm_hash: 1,
3450            offset: usize::MAX - 2,
3451            length: 10,
3452        }];
3453        let err = validate_and_sort_mm_info(&bad, usize::MAX).expect_err("must reject overflow");
3454        assert!(matches!(err, MmInfoError::OffsetOverflow { .. }));
3455    }
3456
3457    /// #12: building incrementally via push_token / push_mm_run yields the same sequence
3458    /// as the batch `new_with_mm` constructor.
3459    #[test]
3460    fn tokens_mm_streaming_equals_batch() {
3461        // Layout: [t,t,t,(MM=0xAA len=4),t,t] over block_size=4 ⇒ 2 blocks + partial.
3462        let tokens = Tokens::from(vec![1u32, 2, 3, 0, 0, 0, 0, 6, 7]);
3463        let mm = vec![TokenBlockMmInfo {
3464            mm_hash: 0xAAu64,
3465            offset: 3,
3466            length: 4,
3467        }];
3468        let salt = Some(TEST_SALT_HASH);
3469        let batch = TokenBlockSequence::new_with_mm(tokens, &mm, 4, salt).unwrap();
3470
3471        let mut streamed = TokenBlockSequence::new(Tokens::default(), 4, salt);
3472        streamed.push_token(1).unwrap();
3473        streamed.push_token(2).unwrap();
3474        streamed.push_token(3).unwrap();
3475        streamed.push_mm_run(0xAAu64, 4).unwrap();
3476        streamed.push_token(6).unwrap();
3477        streamed.push_token(7).unwrap();
3478
3479        assert_eq!(streamed.blocks().len(), batch.blocks().len());
3480        for (a, b) in streamed.blocks().iter().zip(batch.blocks().iter()) {
3481            assert_eq!(a.block_hash(), b.block_hash(), "block_hash mismatch");
3482            assert_eq!(a.sequence_hash(), b.sequence_hash(), "seq_hash mismatch");
3483            assert_eq!(
3484                a.positional_lineage_hash(),
3485                b.positional_lineage_hash(),
3486                "PLH mismatch"
3487            );
3488        }
3489        assert_eq!(streamed.mm_runs(), batch.mm_runs());
3490    }
3491
3492    /// #13: a multi-block MM run produces distinct block_hashes for blocks fully covered by
3493    /// the run (run_offset increases monotonically), and shares prefix hashes with another
3494    /// request whose run starts at the same global offset.
3495    #[test]
3496    fn tokens_mm_multi_block_run() {
3497        let block_size: u32 = 8;
3498        let bs = block_size as usize;
3499        // Run of length 2*bs + k = 20 starting at the boundary of block 0 ⇒ spans blocks 0,1,2.
3500        // Blocks 0 and 1 are fully placeholders; block 2 starts as 4 placeholders + 4 reals.
3501        let mut tokens_a: Vec<Token> = vec![0u32; 2 * bs]; // blocks 0, 1 (placeholders)
3502        tokens_a.extend_from_slice(&[0u32, 0, 0, 0, 100, 101, 102, 103]); // block 2
3503        let tokens_a = Tokens::from(tokens_a);
3504        let mm = vec![TokenBlockMmInfo {
3505            mm_hash: 0xCAFEBABEu64,
3506            offset: 0,
3507            length: 20,
3508        }];
3509        let seq_a = TokenBlockSequence::new_with_mm(
3510            tokens_a.clone(),
3511            &mm,
3512            block_size,
3513            Some(TEST_SALT_HASH),
3514        )
3515        .unwrap();
3516        assert_eq!(seq_a.blocks().len(), 3);
3517
3518        // Block 0 and block 1 are *both* fully placeholder, but with different run_offsets
3519        // (0..7 vs 8..15) → block_hashes must differ.
3520        let bh0 = seq_a.blocks()[0].block_hash();
3521        let bh1 = seq_a.blocks()[1].block_hash();
3522        assert_ne!(
3523            bh0, bh1,
3524            "fully-placeholder blocks at different run_offsets must hash differently"
3525        );
3526
3527        // Same image at the same global starting position in another request must share blocks.
3528        let seq_b =
3529            TokenBlockSequence::new_with_mm(tokens_a, &mm, block_size, Some(TEST_SALT_HASH))
3530                .unwrap();
3531        assert_eq!(
3532            seq_a.blocks()[0].block_hash(),
3533            seq_b.blocks()[0].block_hash()
3534        );
3535        assert_eq!(
3536            seq_a.blocks()[1].block_hash(),
3537            seq_b.blocks()[1].block_hash()
3538        );
3539        assert_eq!(
3540            seq_a.blocks()[2].block_hash(),
3541            seq_b.blocks()[2].block_hash()
3542        );
3543
3544        // A different mm_hash at the same position must diverge starting at block 0.
3545        let mm_diff = vec![TokenBlockMmInfo {
3546            mm_hash: 0xDEADBEEFu64,
3547            offset: 0,
3548            length: 20,
3549        }];
3550        let mut tokens_c: Vec<Token> = vec![0u32; 2 * bs];
3551        tokens_c.extend_from_slice(&[0u32, 0, 0, 0, 100, 101, 102, 103]);
3552        let seq_c = TokenBlockSequence::new_with_mm(
3553            Tokens::from(tokens_c),
3554            &mm_diff,
3555            block_size,
3556            Some(TEST_SALT_HASH),
3557        )
3558        .unwrap();
3559        assert_ne!(
3560            seq_a.blocks()[0].block_hash(),
3561            seq_c.blocks()[0].block_hash()
3562        );
3563    }
3564
3565    /// #14: mm_info validation rejects overlap, out-of-bounds, and zero-length runs.
3566    #[test]
3567    fn tokens_mm_validation() {
3568        let tokens = Tokens::from(vec![0u32; 32]);
3569        // Overlap.
3570        let overlap = vec![
3571            TokenBlockMmInfo {
3572                mm_hash: 1,
3573                offset: 0,
3574                length: 5,
3575            },
3576            TokenBlockMmInfo {
3577                mm_hash: 2,
3578                offset: 4,
3579                length: 5,
3580            },
3581        ];
3582        let err = TokenBlockSequence::new_with_mm(tokens.clone(), &overlap, 4, None).unwrap_err();
3583        assert!(matches!(
3584            err,
3585            TokenBlockError::MmInfo(MmInfoError::Overlapping { .. })
3586        ));
3587
3588        // Out-of-bounds.
3589        let oob = vec![TokenBlockMmInfo {
3590            mm_hash: 1,
3591            offset: 30,
3592            length: 10,
3593        }];
3594        let err = TokenBlockSequence::new_with_mm(tokens.clone(), &oob, 4, None).unwrap_err();
3595        assert!(matches!(
3596            err,
3597            TokenBlockError::MmInfo(MmInfoError::OutOfBounds { .. })
3598        ));
3599
3600        // Zero-length run.
3601        let empty = vec![TokenBlockMmInfo {
3602            mm_hash: 1,
3603            offset: 0,
3604            length: 0,
3605        }];
3606        let err = TokenBlockSequence::new_with_mm(tokens, &empty, 4, None).unwrap_err();
3607        assert!(matches!(
3608            err,
3609            TokenBlockError::MmInfo(MmInfoError::EmptyRun)
3610        ));
3611
3612        // push_mm_run with length 0.
3613        let mut seq = TokenBlockSequence::new(Tokens::default(), 4, None);
3614        let err = seq.push_mm_run(0xAB, 0).unwrap_err();
3615        assert!(matches!(
3616            err,
3617            TokenBlockError::MmInfo(MmInfoError::EmptyRun)
3618        ));
3619
3620        // truncate / pop / unwind blocked once mm_runs is non-empty.
3621        let mut seq = TokenBlockSequence::new(Tokens::from(vec![1u32, 2, 3]), 4, None);
3622        seq.push_mm_run(0xAB, 2).unwrap();
3623        assert!(matches!(
3624            seq.truncate(0).unwrap_err(),
3625            TokenBlockError::MmRunsPresent
3626        ));
3627        assert!(matches!(
3628            seq.unwind(1).unwrap_err(),
3629            TokenBlockError::MmRunsPresent
3630        ));
3631    }
3632}
dynamo_tokens/lib.rs

dynamo_tokens/
lib.rs