Skip to main content

cjc_data/
byte_dict.rs

1//! Deterministic Adaptive Dictionary Engine — Phase 1 core (v3 brief).
2//!
3//! A byte-first categorical engine designed to coexist with the existing
4//! `Column::Categorical` and `FctColumn` types (which stay for backwards
5//! compat). New code that needs a deterministic, memory-efficient
6//! categorical column should prefer `CategoricalColumn` from this module.
7//!
8//! # Design summary
9//!
10//! - `ByteStringPool` — append-only `Vec<u8>` arena with stable
11//!   `(offset, len)` handles. No `String` in the hot path.
12//! - `ByteStrView` — opaque `(offset, len)` handle into a pool.
13//! - `AdaptiveCodes` — 4-arm enum (U8/U16/U32/U64). Promotes deterministically
14//!   when cardinality crosses 256 / 65 536 / 2³² boundaries.
15//! - `ByteDictionary` — `BTreeMap<Vec<u8>, u64>` lookup, `frozen` flag,
16//!   `CategoryOrdering` policy. Deterministic by construction.
17//! - `CategoricalColumn` — codes + dictionary + optional null bitmap.
18//!
19//! # Determinism contract
20//!
21//! 1. All thresholds are integer-only. No float math anywhere.
22//! 2. Lookup is `BTreeMap`, not `HashMap` — no randomized hashing.
23//! 3. Lexical ordering uses raw byte comparison (`Vec<u8>::cmp`), not
24//!    Unicode-aware sort. Cross-machine reproducibility is guaranteed.
25//! 4. Code-width promotion is *lazy* — triggered only when the current arm
26//!    physically cannot hold the next code (i.e., inserting code 256 into
27//!    a U8 arm). It is not predictive.
28//! 5. `intern()` on a frozen dictionary returns `Err`, never silently
29//!    extends.
30//! 6. The same byte sequence interned in two fresh dictionaries with the
31//!    same `CategoryOrdering` produces bit-identical code sequences.
32//!
33//! # What this module does NOT do (deferred)
34//!
35//! - Wiring into TidyView verbs (Phase 2)
36//! - Categorical-aware group_by/join (Phase 3)
37//! - Replacement of existing `Column::Categorical` / `FctColumn`
38//! - Language-level `.cjcl` builtins
39//!
40//! # Layout
41//!
42//! Public API at the top, helpers below, `#[cfg(test)] mod tests` at the
43//! bottom. Inline unit tests pin every invariant in the determinism
44//! contract; bolero fuzz lives in `tests/bolero_fuzz/categorical_dictionary_fuzz.rs`.
45
46use crate::BitMask;
47use std::collections::BTreeMap;
48
49// ════════════════════════════════════════════════════════════════════════
50//  ByteStringPool — append-only Vec<u8> arena with stable handles
51// ════════════════════════════════════════════════════════════════════════
52
53/// Append-only byte arena. Each interned byte sequence gets a stable
54/// `ByteStrView` handle whose `(offset, len)` survives all subsequent
55/// insertions (the underlying `Vec<u8>` may reallocate, but the indices
56/// into it are stable).
57///
58/// Pool capacity:
59/// - `bytes`: up to 4 GiB total raw payload (`u32` offsets).
60/// - `offsets`: up to 2³² distinct entries.
61/// - `lens`: up to 2³² individual byte length (per-entry cap is also 2³²).
62///
63/// These limits are documented but not actively enforced beyond a
64/// `try_push` API: in v3 use the pool well below those numbers; if a
65/// future workload approaches the cap, promote to a `u64`-offset variant.
66#[derive(Debug, Clone, Default, PartialEq, Eq)]
67pub struct ByteStringPool {
68    bytes: Vec<u8>,
69    offsets: Vec<u32>,
70    lens: Vec<u32>,
71}
72
73impl ByteStringPool {
74    /// Create an empty pool.
75    pub fn new() -> Self {
76        Self::default()
77    }
78
79    /// Number of entries currently in the pool.
80    pub fn len(&self) -> usize {
81        self.offsets.len()
82    }
83
84    /// True if the pool has no entries.
85    pub fn is_empty(&self) -> bool {
86        self.offsets.is_empty()
87    }
88
89    /// Total bytes stored (sum of all entry lengths). O(1).
90    pub fn byte_size(&self) -> usize {
91        self.bytes.len()
92    }
93
94    /// Append a byte sequence and return a stable view.
95    ///
96    /// Returns `Err` if the per-entry length exceeds `u32::MAX` or the
97    /// total byte payload would exceed `u32::MAX`. These limits are wide
98    /// enough that real-world pipelines never hit them, but we surface
99    /// them as errors rather than panic for safety at boundaries.
100    pub fn push(&mut self, bytes: &[u8]) -> Result<ByteStrView, ByteDictError> {
101        let len = u32::try_from(bytes.len()).map_err(|_| ByteDictError::EntryTooLong {
102            got: bytes.len(),
103        })?;
104        let new_total = self.bytes.len().checked_add(bytes.len()).ok_or(
105            ByteDictError::PoolOverflow {
106                attempted: bytes.len(),
107                current: self.bytes.len(),
108            },
109        )?;
110        let offset = u32::try_from(self.bytes.len()).map_err(|_| ByteDictError::PoolOverflow {
111            attempted: bytes.len(),
112            current: self.bytes.len(),
113        })?;
114        if new_total > u32::MAX as usize {
115            return Err(ByteDictError::PoolOverflow {
116                attempted: bytes.len(),
117                current: self.bytes.len(),
118            });
119        }
120        self.bytes.extend_from_slice(bytes);
121        self.offsets.push(offset);
122        self.lens.push(len);
123        Ok(ByteStrView { offset, len })
124    }
125
126    /// Resolve a view back to its byte slice. Panics in debug builds if the
127    /// view points outside the pool; in release builds returns an empty
128    /// slice for an out-of-bounds view (defensive — should never happen if
129    /// views are only constructed by `push`).
130    pub fn get(&self, view: ByteStrView) -> &[u8] {
131        let start = view.offset as usize;
132        let end = start.saturating_add(view.len as usize);
133        if end > self.bytes.len() {
134            debug_assert!(false, "ByteStrView out of bounds for pool");
135            return &[];
136        }
137        &self.bytes[start..end]
138    }
139
140    /// Resolve a view by entry index (position in insertion order).
141    ///
142    /// Returns `None` if `idx >= self.len()`.
143    pub fn get_by_index(&self, idx: usize) -> Option<&[u8]> {
144        let offset = *self.offsets.get(idx)? as usize;
145        let len = *self.lens.get(idx)? as usize;
146        Some(&self.bytes[offset..offset + len])
147    }
148
149    /// Iterate `(index, bytes)` in insertion order. Deterministic.
150    pub fn iter(&self) -> impl Iterator<Item = (usize, &[u8])> + '_ {
151        self.offsets.iter().zip(self.lens.iter()).enumerate().map(
152            move |(i, (&off, &len))| {
153                let start = off as usize;
154                let end = start + len as usize;
155                (i, &self.bytes[start..end])
156            },
157        )
158    }
159
160    /// Iterate views in insertion order. Use when you want to keep the
161    /// `(offset, len)` representation (e.g. to thread it through a
162    /// dictionary lookup).
163    pub fn iter_views(&self) -> impl Iterator<Item = ByteStrView> + '_ {
164        self.offsets
165            .iter()
166            .zip(self.lens.iter())
167            .map(|(&offset, &len)| ByteStrView { offset, len })
168    }
169}
170
171/// Opaque handle into a `ByteStringPool`. Cheap to copy.
172///
173/// Two views are equal if and only if their `(offset, len)` are equal —
174/// this does NOT compare bytes. Use `ByteStringPool::get` to fetch the
175/// payload before doing a content comparison.
176#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
177pub struct ByteStrView {
178    pub offset: u32,
179    pub len: u32,
180}
181
182impl ByteStrView {
183    /// Length in bytes.
184    pub fn byte_len(self) -> usize {
185        self.len as usize
186    }
187
188    /// True if the view points to an empty byte sequence.
189    pub fn is_empty(self) -> bool {
190        self.len == 0
191    }
192}
193
194// ════════════════════════════════════════════════════════════════════════
195//  AdaptiveCodes — code storage with deterministic width promotion
196// ════════════════════════════════════════════════════════════════════════
197
198/// Adaptive-width code storage. Promotes to a wider arm only when the
199/// current arm physically cannot hold the next code:
200///
201/// - U8  → U16  when inserting a code ≥ 256
202/// - U16 → U32  when inserting a code ≥ 65 536
203/// - U32 → U64  when inserting a code ≥ 2³²
204///
205/// Promotion preserves all existing codes bit-for-bit. The `len()` and
206/// the sequence of values returned by `iter()` are invariant under
207/// promotion.
208#[derive(Debug, Clone, PartialEq, Eq)]
209pub enum AdaptiveCodes {
210    U8(Vec<u8>),
211    U16(Vec<u16>),
212    U32(Vec<u32>),
213    U64(Vec<u64>),
214}
215
216impl AdaptiveCodes {
217    /// Empty U8-arm storage.
218    pub fn new() -> Self {
219        AdaptiveCodes::U8(Vec::new())
220    }
221
222    /// Pre-allocated empty U8-arm storage.
223    pub fn with_capacity(cap: usize) -> Self {
224        AdaptiveCodes::U8(Vec::with_capacity(cap))
225    }
226
227    /// Number of codes stored.
228    pub fn len(&self) -> usize {
229        match self {
230            AdaptiveCodes::U8(v) => v.len(),
231            AdaptiveCodes::U16(v) => v.len(),
232            AdaptiveCodes::U32(v) => v.len(),
233            AdaptiveCodes::U64(v) => v.len(),
234        }
235    }
236
237    /// True if no codes stored.
238    pub fn is_empty(&self) -> bool {
239        self.len() == 0
240    }
241
242    /// Append a code. Promotes the arm in-place if `code` does not fit.
243    ///
244    /// Promotion is *lazy*: a U8 arm with codes [0, 1, 2] and an insert
245    /// of `255` stays U8; insert of `256` promotes to U16 with codes
246    /// [0, 1, 2, 256].
247    pub fn push(&mut self, code: u64) {
248        // Promote if needed.
249        if self.needs_promotion_for(code) {
250            self.promote_to_fit(code);
251        }
252        match self {
253            AdaptiveCodes::U8(v) => v.push(code as u8),
254            AdaptiveCodes::U16(v) => v.push(code as u16),
255            AdaptiveCodes::U32(v) => v.push(code as u32),
256            AdaptiveCodes::U64(v) => v.push(code),
257        }
258    }
259
260    /// Get the code at index `i`. Panics if `i >= len()`.
261    pub fn get(&self, i: usize) -> u64 {
262        match self {
263            AdaptiveCodes::U8(v) => v[i] as u64,
264            AdaptiveCodes::U16(v) => v[i] as u64,
265            AdaptiveCodes::U32(v) => v[i] as u64,
266            AdaptiveCodes::U64(v) => v[i],
267        }
268    }
269
270    /// Iterate codes as `u64` (the widest representation, lossless for all
271    /// arms).
272    pub fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
273        match self {
274            AdaptiveCodes::U8(v) => Box::new(v.iter().map(|&x| x as u64)),
275            AdaptiveCodes::U16(v) => Box::new(v.iter().map(|&x| x as u64)),
276            AdaptiveCodes::U32(v) => Box::new(v.iter().map(|&x| x as u64)),
277            AdaptiveCodes::U64(v) => Box::new(v.iter().copied()),
278        }
279    }
280
281    /// Width in bytes per code (1, 2, 4, or 8). Useful for memory
282    /// accounting and benchmarks.
283    pub fn width_bytes(&self) -> usize {
284        match self {
285            AdaptiveCodes::U8(_) => 1,
286            AdaptiveCodes::U16(_) => 2,
287            AdaptiveCodes::U32(_) => 4,
288            AdaptiveCodes::U64(_) => 8,
289        }
290    }
291
292    /// True if this arm cannot represent `code`.
293    fn needs_promotion_for(&self, code: u64) -> bool {
294        match self {
295            AdaptiveCodes::U8(_) => code > u8::MAX as u64,
296            AdaptiveCodes::U16(_) => code > u16::MAX as u64,
297            AdaptiveCodes::U32(_) => code > u32::MAX as u64,
298            AdaptiveCodes::U64(_) => false,
299        }
300    }
301
302    /// Promote to the smallest arm that can hold `code`.
303    fn promote_to_fit(&mut self, code: u64) {
304        // Move out of self temporarily.
305        let old = std::mem::replace(self, AdaptiveCodes::U64(Vec::new()));
306        let target = if code <= u16::MAX as u64 {
307            // U8 → U16
308            let v: Vec<u16> = match old {
309                AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u16).collect(),
310                AdaptiveCodes::U16(v) => v,
311                AdaptiveCodes::U32(_) | AdaptiveCodes::U64(_) => {
312                    debug_assert!(false, "promote_to_fit called with shrinking target");
313                    Vec::new()
314                }
315            };
316            AdaptiveCodes::U16(v)
317        } else if code <= u32::MAX as u64 {
318            let v: Vec<u32> = match old {
319                AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u32).collect(),
320                AdaptiveCodes::U16(v) => v.into_iter().map(|x| x as u32).collect(),
321                AdaptiveCodes::U32(v) => v,
322                AdaptiveCodes::U64(_) => {
323                    debug_assert!(false, "promote_to_fit called with shrinking target");
324                    Vec::new()
325                }
326            };
327            AdaptiveCodes::U32(v)
328        } else {
329            let v: Vec<u64> = match old {
330                AdaptiveCodes::U8(v) => v.into_iter().map(|x| x as u64).collect(),
331                AdaptiveCodes::U16(v) => v.into_iter().map(|x| x as u64).collect(),
332                AdaptiveCodes::U32(v) => v.into_iter().map(|x| x as u64).collect(),
333                AdaptiveCodes::U64(v) => v,
334            };
335            AdaptiveCodes::U64(v)
336        };
337        *self = target;
338    }
339}
340
341impl Default for AdaptiveCodes {
342    fn default() -> Self {
343        Self::new()
344    }
345}
346
347// ════════════════════════════════════════════════════════════════════════
348//  CategoryOrdering — how codes are assigned to byte sequences
349// ════════════════════════════════════════════════════════════════════════
350
351/// Policy for assigning codes to byte sequences during dictionary
352/// construction.
353///
354/// - `FirstSeen`: insertion order. Code = position of first occurrence.
355///   Best for ingestion stability (the first row sets the order, and
356///   subsequent rows of the same value reuse the same code).
357/// - `Lexical`: byte-lexicographic order. The dictionary collects all
358///   distinct bytes first, then assigns codes by sorting via
359///   `Vec<u8>::cmp`. Best for cross-dataset reproducibility (any
360///   permutation of the same input set produces the same code map).
361/// - `Explicit(values)`: user provides the canonical order. Codes are
362///   assigned `0..values.len()` in the order given. Best for
363///   production ML schemas where the inference-time code map must
364///   match training exactly.
365#[derive(Debug, Clone, PartialEq, Eq)]
366pub enum CategoryOrdering {
367    FirstSeen,
368    Lexical,
369    Explicit(Vec<Vec<u8>>),
370}
371
372// ════════════════════════════════════════════════════════════════════════
373//  UnknownCategoryPolicy — handling values not in the dictionary
374// ════════════════════════════════════════════════════════════════════════
375
376/// Policy for `intern_with_policy` when a byte sequence is not in the
377/// dictionary AND the dictionary is `frozen`.
378///
379/// - `Error`: return `Err(UnknownCategory)`. Strictest. Recommended for
380///   safety-critical inference pipelines where a previously-unseen
381///   category indicates a real data issue.
382/// - `MapToOther`: return the code of a designated "Other" bucket. The
383///   bucket must be added before freezing — the dictionary does not
384///   silently create one.
385/// - `MapToNull`: return `Ok(None)`, signalling the caller should mark
386///   the row as null in the categorical column's null bitmap.
387/// - `ExtendDictionary`: ignore the frozen flag and add the value. Only
388///   safe in training; never in inference.
389#[derive(Debug, Clone, PartialEq, Eq)]
390pub enum UnknownCategoryPolicy {
391    Error,
392    MapToOther { other_code: u64 },
393    MapToNull,
394    ExtendDictionary,
395}
396
397/// Outcome of `intern_with_policy`.
398#[derive(Debug, Clone, Copy, PartialEq, Eq)]
399pub enum InternedCode {
400    /// A concrete code was assigned or looked up.
401    Code(u64),
402    /// `MapToNull` was active and the value was unknown — the caller
403    /// should set the null bit at this row.
404    Null,
405}
406
407// ════════════════════════════════════════════════════════════════════════
408//  Errors
409// ════════════════════════════════════════════════════════════════════════
410
411#[derive(Debug, Clone, PartialEq, Eq)]
412pub enum ByteDictError {
413    /// Single entry exceeded `u32::MAX` bytes.
414    EntryTooLong { got: usize },
415    /// Total pool would exceed `u32::MAX` bytes.
416    PoolOverflow { attempted: usize, current: usize },
417    /// Attempted to intern into a frozen dictionary with `Error` policy.
418    UnknownCategory,
419    /// Attempted to intern into a frozen dictionary.
420    Frozen,
421    /// Explicit ordering was provided but contains duplicates.
422    ExplicitOrderingHasDuplicates,
423    /// `MapToOther` policy was used but the `other_code` is not present
424    /// in the dictionary.
425    OtherCodeNotInDictionary { code: u64 },
426}
427
428impl std::fmt::Display for ByteDictError {
429    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
430        match self {
431            ByteDictError::EntryTooLong { got } => {
432                write!(f, "ByteStringPool entry too long: {got} bytes (max {})", u32::MAX)
433            }
434            ByteDictError::PoolOverflow { attempted, current } => write!(
435                f,
436                "ByteStringPool overflow: cannot append {attempted} bytes (current {current})"
437            ),
438            ByteDictError::UnknownCategory => {
439                write!(f, "unknown category in frozen dictionary (Error policy)")
440            }
441            ByteDictError::Frozen => write!(f, "dictionary is frozen"),
442            ByteDictError::ExplicitOrderingHasDuplicates => {
443                write!(f, "explicit ordering contains duplicate values")
444            }
445            ByteDictError::OtherCodeNotInDictionary { code } => {
446                write!(f, "MapToOther policy: other_code {code} not in dictionary")
447            }
448        }
449    }
450}
451
452impl std::error::Error for ByteDictError {}
453
454// ════════════════════════════════════════════════════════════════════════
455//  ByteDictionary — pool + BTreeMap lookup + ordering policy + frozen flag
456// ════════════════════════════════════════════════════════════════════════
457
458/// Deterministic byte-keyed dictionary.
459///
460/// Owns:
461/// - a `ByteStringPool` storing every distinct byte sequence
462/// - a `BTreeMap<Vec<u8>, u64>` mapping bytes → code (deterministic
463///   iteration; no randomized hashing)
464/// - an `ordering` policy controlling code assignment
465/// - a `frozen` flag preventing inadvertent extension
466///
467/// # Code stability across runs
468///
469/// For a given `(input bytes sequence, ordering)` pair, the dictionary
470/// produces bit-identical codes on every machine, every architecture,
471/// every locale. The only inputs to the code map are the raw bytes and
472/// the ordering policy.
473#[derive(Debug, Clone)]
474pub struct ByteDictionary {
475    pool: ByteStringPool,
476    /// Maps owned `Vec<u8>` keys → assigned code.
477    /// `Vec<u8>` (owned) is required because the pool may reallocate
478    /// during interning — we cannot key by `ByteStrView` until we know
479    /// every key has stable storage.
480    lookup: BTreeMap<Vec<u8>, u64>,
481    /// View-by-code lookup. `code_to_view[c as usize]` returns the
482    /// `ByteStrView` for code `c`. Codes are dense `0..n_categories`.
483    code_to_view: Vec<ByteStrView>,
484    frozen: bool,
485    ordering: CategoryOrdering,
486    /// v3 Phase 7: optional sealed `DHarht` lookup accelerator. Built
487    /// by `seal_for_lookup()`. Pre-seal the field is `None` and lookups
488    /// fall through to `lookup` (BTreeMap). Post-seal `dharht` becomes
489    /// the primary lookup; `lookup` is retained for canonical iteration
490    /// (insertion-order display, snapshot output, range scans).
491    dharht: Option<crate::detcoll::DHarht<u64>>,
492    /// v3 Phase 11: optional u64-hash content-cache built on
493    /// `SealedU64Map` (which wraps `DHarhtMemory`). Lets callers who
494    /// already have the deterministic 64-bit hash of bytes (from
495    /// snapshot diffing, content-addressed storage, etc.) look up the
496    /// dictionary code via `lookup_by_hash` without rehashing.
497    /// Built by `seal_with_u64_hash_index()`; `None` otherwise.
498    hash_index: Option<crate::detcoll::SealedU64Map<u64>>,
499}
500
501impl ByteDictionary {
502    /// Empty dictionary with `FirstSeen` ordering. Not frozen.
503    pub fn new() -> Self {
504        Self::with_ordering(CategoryOrdering::FirstSeen)
505    }
506
507    /// Empty dictionary with the given ordering. For `Explicit`, codes are
508    /// pre-populated `0..values.len()` immediately.
509    pub fn with_ordering(ordering: CategoryOrdering) -> Self {
510        let mut dict = ByteDictionary {
511            pool: ByteStringPool::new(),
512            lookup: BTreeMap::new(),
513            code_to_view: Vec::new(),
514            frozen: false,
515            ordering: ordering.clone(),
516            dharht: None,
517            hash_index: None,
518        };
519        if let CategoryOrdering::Explicit(values) = ordering {
520            for v in values {
521                let _ = dict.intern_internal(&v, false);
522            }
523        }
524        dict
525    }
526
527    /// Build an `Explicit`-ordered dictionary from the given values. The
528    /// dictionary is returned NOT frozen — the caller can choose to
529    /// freeze it. Returns `Err` if `values` contains duplicates.
530    pub fn from_explicit(values: Vec<Vec<u8>>) -> Result<Self, ByteDictError> {
531        // Detect duplicates via a sorted scan (deterministic).
532        let mut sorted = values.clone();
533        sorted.sort();
534        for w in sorted.windows(2) {
535            if w[0] == w[1] {
536                return Err(ByteDictError::ExplicitOrderingHasDuplicates);
537            }
538        }
539        Ok(Self::with_ordering(CategoryOrdering::Explicit(values)))
540    }
541
542    /// Number of categories.
543    pub fn len(&self) -> usize {
544        self.code_to_view.len()
545    }
546
547    /// True if no categories.
548    pub fn is_empty(&self) -> bool {
549        self.code_to_view.is_empty()
550    }
551
552    /// True if the dictionary is frozen.
553    pub fn is_frozen(&self) -> bool {
554        self.frozen
555    }
556
557    /// The ordering policy.
558    pub fn ordering(&self) -> &CategoryOrdering {
559        &self.ordering
560    }
561
562    /// Freeze the dictionary. After freezing, `intern` returns
563    /// `Err(Frozen)` for unknown values; `intern_with_policy` honours
564    /// `UnknownCategoryPolicy`. Lookups continue to work.
565    pub fn freeze(&mut self) {
566        self.frozen = true;
567    }
568
569    /// Direct read access to the underlying pool.
570    pub fn pool(&self) -> &ByteStringPool {
571        &self.pool
572    }
573
574    /// Resolve a code back to its byte payload. Returns `None` for
575    /// out-of-range codes.
576    pub fn get(&self, code: u64) -> Option<&[u8]> {
577        let idx = usize::try_from(code).ok()?;
578        let view = *self.code_to_view.get(idx)?;
579        Some(self.pool.get(view))
580    }
581
582    /// Look up a byte sequence. Does not extend the dictionary. Returns
583    /// `None` if not present.
584    ///
585    /// v3 Phase 7: when the dictionary has been `seal_for_lookup()`-ed,
586    /// the primary lookup goes through the `DHarht` Memory profile.
587    /// Falls back to the `BTreeMap` for unsealed dictionaries (which is
588    /// also the canonical iteration source).
589    pub fn lookup(&self, bytes: &[u8]) -> Option<u64> {
590        if let Some(d) = &self.dharht {
591            return d.get_bytes(bytes).copied();
592        }
593        self.lookup.get(bytes).copied()
594    }
595
596    /// v3 Phase 7: build the `DHarht` lookup accelerator and seal it.
597    /// After this call, `lookup()` routes through the `DHarht` and the
598    /// dictionary should be treated as read-only for performance
599    /// reasons (mutations are not blocked but invalidate the
600    /// accelerator — they trigger a debug-build assertion). The
601    /// `BTreeMap` lookup table is preserved for canonical iteration
602    /// and for `range`-style queries that the `DHarht` does not
603    /// support.
604    ///
605    /// Spec compliance:
606    /// - splitmix64 deterministic scattering ✓
607    /// - 256 shards (power of two) ✓
608    /// - sealed sparse 16-bit front directory ✓
609    /// - MicroBucket16 with deterministic BTreeMap overflow on
610    ///   bucket > 16 (no silent entry loss) ✓
611    /// - full key equality on every successful lookup ✓
612    pub fn seal_for_lookup(&mut self) {
613        let mut d = crate::detcoll::DHarht::new();
614        for (k, &code) in &self.lookup {
615            d.insert_bytes(k.as_slice(), code);
616        }
617        d.seal_for_lookup();
618        self.dharht = Some(d);
619    }
620
621    /// True if the dictionary has been sealed with a `DHarht`
622    /// accelerator. Distinct from the `frozen` flag (which controls
623    /// extension, not lookup backend).
624    pub fn is_lookup_sealed(&self) -> bool {
625        self.dharht.is_some()
626    }
627
628    /// Diagnostic: number of entries that overflowed to the per-shard
629    /// `BTreeMap` fallback in the `DHarht`. Always 0 if not sealed.
630    pub fn dharht_overflow_count(&self) -> u64 {
631        self.dharht.as_ref().map(|d| d.overflow_count()).unwrap_or(0)
632    }
633
634    /// v3 Phase 11: build a u64-hash content-addressed lookup index
635    /// using `SealedU64Map` (DHarhtMemory profile). After this call,
636    /// `lookup_by_hash(h)` returns the dictionary code for whichever
637    /// byte sequence hashes to `h`. The hash function is the
638    /// workspace's deterministic `crate::detcoll::hash_bytes` so the
639    /// caller's pre-computed hash and the index agree byte-for-byte.
640    ///
641    /// Use case: snapshot diffing, content-addressed storage,
642    /// reproducibility-critical pipelines where the hash is the
643    /// canonical identifier.
644    ///
645    /// This is **independent** of `seal_for_lookup()` — you can call
646    /// either, both, or neither. Both indices, when built, are
647    /// mutually consistent: they reference the same code space.
648    pub fn seal_with_u64_hash_index(&mut self) {
649        let mut idx = crate::detcoll::SealedU64Map::new();
650        for (k, &code) in &self.lookup {
651            let h = crate::detcoll::hash_bytes(k.as_slice());
652            idx.insert(h, code);
653        }
654        idx.seal();
655        self.hash_index = Some(idx);
656    }
657
658    /// True if the u64-hash index has been built.
659    pub fn is_hash_indexed(&self) -> bool {
660        self.hash_index.is_some()
661    }
662
663    /// Look up a code by the deterministic u64 hash of its bytes.
664    /// Returns `None` if the hash is unknown OR if the hash index has
665    /// not been built (call `seal_with_u64_hash_index()` first).
666    ///
667    /// **Hash collision safety**: this is a hash-only lookup with no
668    /// full byte equality check. Two distinct byte sequences hashing
669    /// to the same `u64` would return one of the two codes — that's
670    /// `O(2^-64)` for `splitmix64`-mixed hashes (well-distributed
671    /// inputs). For safety-critical paths use `lookup_by_hash_verify`
672    /// which carries the original bytes and verifies.
673    pub fn lookup_by_hash(&self, hash: u64) -> Option<u64> {
674        self.hash_index.as_ref()?.get(hash).copied()
675    }
676
677    /// Hash-keyed lookup with explicit byte verification. Returns the
678    /// code only if both (a) the hash maps to a known code AND (b) the
679    /// stored bytes for that code match `bytes` exactly. This closes
680    /// the `O(2^-64)` collision window of `lookup_by_hash`.
681    pub fn lookup_by_hash_verify(&self, hash: u64, bytes: &[u8]) -> Option<u64> {
682        let code = self.lookup_by_hash(hash)?;
683        let stored = self.get(code)?;
684        if stored == bytes { Some(code) } else { None }
685    }
686
687    /// Intern a byte sequence. If not present and the dictionary is not
688    /// frozen, assigns a new code (only valid under `FirstSeen` /
689    /// `ExtendDictionary`-ish flows). Under `Lexical` ordering this method
690    /// works in *streaming mode* — codes are assigned in encounter order,
691    /// then `seal_lexical()` re-sorts them at the end. For `Explicit`,
692    /// `intern` errors on unknown values (the explicit list is the
693    /// authority).
694    pub fn intern(&mut self, bytes: &[u8]) -> Result<u64, ByteDictError> {
695        self.intern_internal(bytes, true)
696    }
697
698    /// Intern with explicit unknown policy. This is the inference-time
699    /// API: it does not require the dictionary be unfrozen.
700    pub fn intern_with_policy(
701        &mut self,
702        bytes: &[u8],
703        policy: &UnknownCategoryPolicy,
704    ) -> Result<InternedCode, ByteDictError> {
705        if let Some(c) = self.lookup.get(bytes) {
706            return Ok(InternedCode::Code(*c));
707        }
708        if !self.frozen {
709            // If not frozen, behave like intern — extend the dictionary.
710            return self.intern_internal(bytes, true).map(InternedCode::Code);
711        }
712        match policy {
713            UnknownCategoryPolicy::Error => Err(ByteDictError::UnknownCategory),
714            UnknownCategoryPolicy::MapToNull => Ok(InternedCode::Null),
715            UnknownCategoryPolicy::MapToOther { other_code } => {
716                if (*other_code as usize) < self.code_to_view.len() {
717                    Ok(InternedCode::Code(*other_code))
718                } else {
719                    Err(ByteDictError::OtherCodeNotInDictionary { code: *other_code })
720                }
721            }
722            UnknownCategoryPolicy::ExtendDictionary => {
723                // Bypass the freeze for this call only.
724                self.intern_internal(bytes, false).map(InternedCode::Code)
725            }
726        }
727    }
728
729    /// Internal interning. `respect_frozen` controls whether the frozen
730    /// flag is honoured (used to implement `ExtendDictionary` policy).
731    fn intern_internal(
732        &mut self,
733        bytes: &[u8],
734        respect_frozen: bool,
735    ) -> Result<u64, ByteDictError> {
736        if let Some(&c) = self.lookup.get(bytes) {
737            return Ok(c);
738        }
739        if respect_frozen && self.frozen {
740            return Err(ByteDictError::Frozen);
741        }
742        // For Explicit ordering, a non-explicit value is unknown.
743        if let CategoryOrdering::Explicit(_) = &self.ordering {
744            if respect_frozen {
745                return Err(ByteDictError::UnknownCategory);
746            }
747        }
748        let view = self.pool.push(bytes)?;
749        let code = self.code_to_view.len() as u64;
750        self.code_to_view.push(view);
751        self.lookup.insert(bytes.to_vec(), code);
752        Ok(code)
753    }
754
755    /// Re-assign codes lexicographically (only useful when ordering is
756    /// `Lexical`). Returns the permutation `old_code → new_code` so
757    /// callers can rewrite their code arrays.
758    ///
759    /// For `FirstSeen` and `Explicit`, this is a no-op and returns the
760    /// identity permutation.
761    pub fn seal_lexical(&mut self) -> Vec<u64> {
762        if !matches!(self.ordering, CategoryOrdering::Lexical) {
763            return (0..self.code_to_view.len() as u64).collect();
764        }
765        // Collect (bytes_owned, old_code), sort by bytes, derive perm.
766        let mut entries: Vec<(Vec<u8>, u64)> = self
767            .lookup
768            .iter()
769            .map(|(k, &v)| (k.clone(), v))
770            .collect();
771        entries.sort_by(|a, b| a.0.cmp(&b.0));
772        let mut perm = vec![0u64; self.code_to_view.len()];
773        // Rebuild code_to_view + lookup.
774        let mut new_pool = ByteStringPool::new();
775        let mut new_code_to_view: Vec<ByteStrView> = Vec::with_capacity(entries.len());
776        let mut new_lookup: BTreeMap<Vec<u8>, u64> = BTreeMap::new();
777        for (new_code, (bytes, old_code)) in entries.into_iter().enumerate() {
778            let view = new_pool
779                .push(&bytes)
780                .expect("seal_lexical: re-push cannot exceed original pool size");
781            new_code_to_view.push(view);
782            new_lookup.insert(bytes, new_code as u64);
783            perm[old_code as usize] = new_code as u64;
784        }
785        self.pool = new_pool;
786        self.code_to_view = new_code_to_view;
787        self.lookup = new_lookup;
788        perm
789    }
790
791    /// Iterate `(code, bytes)` pairs in code order. Deterministic.
792    pub fn iter(&self) -> impl Iterator<Item = (u64, &[u8])> + '_ {
793        self.code_to_view.iter().enumerate().map(move |(i, &v)| {
794            (i as u64, self.pool.get(v))
795        })
796    }
797}
798
799impl Default for ByteDictionary {
800    fn default() -> Self {
801        Self::new()
802    }
803}
804
805// ════════════════════════════════════════════════════════════════════════
806//  CategoricalColumn — codes + dictionary + optional null bitmap
807// ════════════════════════════════════════════════════════════════════════
808
809/// A categorical column: a vector of codes pointing into a shared
810/// `ByteDictionary`, plus an optional null bitmap.
811///
812/// # Invariant
813///
814/// `codes.len() == nrows`. If `nulls` is present, `nulls.nrows() ==
815/// nrows`. A row is valid iff `nulls.is_none() || nulls.unwrap().get(i)`.
816/// (A `1` bit means valid — same convention as `BitMask` elsewhere in
817/// cjc-data.)
818///
819/// # Equality
820///
821/// Two `CategoricalColumn`s are deep-equal iff they have the same codes,
822/// same dictionary contents (including ordering), and same null pattern.
823#[derive(Debug, Clone)]
824pub struct CategoricalColumn {
825    codes: AdaptiveCodes,
826    dictionary: ByteDictionary,
827    nulls: Option<BitMask>,
828}
829
830impl CategoricalColumn {
831    /// Empty column with a fresh `FirstSeen` dictionary.
832    pub fn new() -> Self {
833        Self {
834            codes: AdaptiveCodes::new(),
835            dictionary: ByteDictionary::new(),
836            nulls: None,
837        }
838    }
839
840    /// Empty column with the given dictionary (used to share a dictionary
841    /// across columns or to seed with `Explicit` ordering).
842    pub fn with_dictionary(dictionary: ByteDictionary) -> Self {
843        Self {
844            codes: AdaptiveCodes::new(),
845            dictionary,
846            nulls: None,
847        }
848    }
849
850    /// Number of rows.
851    pub fn len(&self) -> usize {
852        self.codes.len()
853    }
854
855    /// True if no rows.
856    pub fn is_empty(&self) -> bool {
857        self.codes.is_empty()
858    }
859
860    /// Read-only access to the dictionary.
861    pub fn dictionary(&self) -> &ByteDictionary {
862        &self.dictionary
863    }
864
865    /// Read-only access to the codes.
866    pub fn codes(&self) -> &AdaptiveCodes {
867        &self.codes
868    }
869
870    /// Read-only access to the null bitmap if present.
871    pub fn nulls(&self) -> Option<&BitMask> {
872        self.nulls.as_ref()
873    }
874
875    /// True if the row at index `i` is null.
876    pub fn is_null(&self, i: usize) -> bool {
877        match &self.nulls {
878            None => false,
879            Some(b) => !b.get(i),
880        }
881    }
882
883    /// Append a non-null value, interning it in the dictionary.
884    ///
885    /// Returns the assigned code.
886    pub fn push(&mut self, bytes: &[u8]) -> Result<u64, ByteDictError> {
887        let code = self.dictionary.intern(bytes)?;
888        self.codes.push(code);
889        if let Some(b) = &mut self.nulls {
890            // Extend null bitmap with a "valid" bit. BitMask is fixed-size,
891            // so we rebuild — this is fine because nulls grow with the
892            // column.
893            *b = bitmask_with_extra_valid(b);
894        }
895        Ok(code)
896    }
897
898    /// Append a null. Allocates the null bitmap on first call if absent.
899    pub fn push_null(&mut self) {
900        // Use sentinel code 0 for the null cell — readers must check
901        // `is_null` first. (We deliberately do not use a "null code" in
902        // the dictionary itself: the dictionary stays clean of synthetic
903        // entries.)
904        self.codes.push(0);
905        match &mut self.nulls {
906            None => {
907                // Existing rows are all valid. Build a fresh bitmap of
908                // length codes.len() with all-ones for prior rows and a
909                // zero for this null.
910                let n = self.codes.len();
911                let mut words = vec![0u64; (n + 63) / 64];
912                // Set bits 0..n-1 to 1 (valid for prior rows; the new
913                // null is at bit n-1).
914                for i in 0..(n - 1) {
915                    words[i / 64] |= 1u64 << (i % 64);
916                }
917                self.nulls =
918                    Some(BitMask::from_words_for_test(words, n));
919            }
920            Some(b) => {
921                *b = bitmask_with_extra_invalid(b);
922            }
923        }
924    }
925
926    /// Append a value with explicit unknown-policy handling. If the
927    /// policy returns `InternedCode::Null`, the row is recorded as null.
928    pub fn push_with_policy(
929        &mut self,
930        bytes: &[u8],
931        policy: &UnknownCategoryPolicy,
932    ) -> Result<(), ByteDictError> {
933        match self.dictionary.intern_with_policy(bytes, policy)? {
934            InternedCode::Code(c) => {
935                self.codes.push(c);
936                if let Some(b) = &mut self.nulls {
937                    *b = bitmask_with_extra_valid(b);
938                }
939            }
940            InternedCode::Null => {
941                self.push_null();
942            }
943        }
944        Ok(())
945    }
946
947    /// Resolve the bytes for row `i`. Returns `None` if the row is null.
948    pub fn get(&self, i: usize) -> Option<&[u8]> {
949        if self.is_null(i) {
950            return None;
951        }
952        let code = self.codes.get(i);
953        self.dictionary.get(code)
954    }
955
956    /// Iterate `(row_index, Option<bytes>)`. Deterministic.
957    pub fn iter(&self) -> impl Iterator<Item = (usize, Option<&[u8]>)> + '_ {
958        (0..self.len()).map(move |i| (i, self.get(i)))
959    }
960
961    /// Run `seal_lexical` on the dictionary and rewrite the codes. After
962    /// this call the dictionary's iteration order is byte-lex; the codes
963    /// are remapped accordingly. Bit-identical results regardless of
964    /// insertion order.
965    pub fn seal_lexical(&mut self) {
966        if !matches!(self.dictionary.ordering(), CategoryOrdering::Lexical) {
967            return;
968        }
969        let perm = self.dictionary.seal_lexical();
970        let mut new_codes = AdaptiveCodes::with_capacity(self.codes.len());
971        for c in self.codes.iter() {
972            new_codes.push(perm[c as usize]);
973        }
974        self.codes = new_codes;
975    }
976
977    /// Build a `CategoricalProfile` summarising this column. Internal
978    /// stats only; not exposed as a language-level builtin in Phase 1.
979    pub fn profile(&self) -> CategoricalProfile {
980        let cardinality = self.dictionary.len();
981        let nrows = self.len();
982        let missing = match &self.nulls {
983            None => 0,
984            Some(b) => nrows - b.count_ones(),
985        };
986        let bytes_used = self.dictionary.pool().byte_size();
987        let avg_byte_len = if cardinality == 0 {
988            0
989        } else {
990            bytes_used / cardinality
991        };
992        let mut max_byte_len = 0usize;
993        for (_, b) in self.dictionary.iter() {
994            if b.len() > max_byte_len {
995                max_byte_len = b.len();
996            }
997        }
998        let unique_ratio_thousandths = if nrows == 0 {
999            0
1000        } else {
1001            (cardinality as u64).saturating_mul(1000) / nrows as u64
1002        };
1003        CategoricalProfile {
1004            nrows,
1005            cardinality,
1006            missing,
1007            bytes_used,
1008            avg_byte_len,
1009            max_byte_len,
1010            code_width_bytes: self.codes.width_bytes(),
1011            unique_ratio_thousandths,
1012        }
1013    }
1014}
1015
1016impl Default for CategoricalColumn {
1017    fn default() -> Self {
1018        Self::new()
1019    }
1020}
1021
1022/// Internal stats computed by `CategoricalColumn::profile()`. Integer
1023/// fields only — no float math, deterministic.
1024#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1025pub struct CategoricalProfile {
1026    pub nrows: usize,
1027    pub cardinality: usize,
1028    pub missing: usize,
1029    pub bytes_used: usize,
1030    pub avg_byte_len: usize,
1031    pub max_byte_len: usize,
1032    pub code_width_bytes: usize,
1033    /// `(cardinality / nrows) * 1000` rounded toward zero. Reported in
1034    /// thousandths to keep the profile integer-only.
1035    pub unique_ratio_thousandths: u64,
1036}
1037
1038// ════════════════════════════════════════════════════════════════════════
1039//  BitMask helpers (private)
1040// ════════════════════════════════════════════════════════════════════════
1041
1042/// Build a new BitMask one bit longer than `b`, with the new bit set
1043/// (valid).
1044fn bitmask_with_extra_valid(b: &BitMask) -> BitMask {
1045    let n = b.nrows() + 1;
1046    let mut words: Vec<u64> = b.words_slice().to_vec();
1047    let needed = (n + 63) / 64;
1048    while words.len() < needed {
1049        words.push(0);
1050    }
1051    let i = n - 1;
1052    words[i / 64] |= 1u64 << (i % 64);
1053    BitMask::from_words_for_test(words, n)
1054}
1055
1056/// Build a new BitMask one bit longer than `b`, with the new bit clear
1057/// (invalid).
1058fn bitmask_with_extra_invalid(b: &BitMask) -> BitMask {
1059    let n = b.nrows() + 1;
1060    let mut words: Vec<u64> = b.words_slice().to_vec();
1061    let needed = (n + 63) / 64;
1062    while words.len() < needed {
1063        words.push(0);
1064    }
1065    // The new bit is bit (n-1); leave it clear by default.
1066    BitMask::from_words_for_test(words, n)
1067}
1068
1069// ════════════════════════════════════════════════════════════════════════
1070//  Tests
1071// ════════════════════════════════════════════════════════════════════════
1072
1073#[cfg(test)]
1074mod tests {
1075    use super::*;
1076
1077    // ── ByteStringPool ──────────────────────────────────────────────────
1078
1079    #[test]
1080    fn pool_round_trip_simple() {
1081        let mut p = ByteStringPool::new();
1082        let v_hello = p.push(b"hello").unwrap();
1083        let v_world = p.push(b"world").unwrap();
1084        assert_eq!(p.get(v_hello), b"hello");
1085        assert_eq!(p.get(v_world), b"world");
1086        assert_eq!(p.len(), 2);
1087        assert_eq!(p.byte_size(), 10);
1088    }
1089
1090    #[test]
1091    fn pool_round_trip_empty_strings() {
1092        // Empty byte strings are valid and must round-trip.
1093        let mut p = ByteStringPool::new();
1094        let v_empty = p.push(b"").unwrap();
1095        let v_x = p.push(b"x").unwrap();
1096        let v_empty2 = p.push(b"").unwrap();
1097        assert!(p.get(v_empty).is_empty());
1098        assert_eq!(p.get(v_x), b"x");
1099        assert!(p.get(v_empty2).is_empty());
1100        assert_eq!(p.len(), 3);
1101    }
1102
1103    #[test]
1104    fn pool_round_trip_embedded_nul_and_high_bytes() {
1105        let mut p = ByteStringPool::new();
1106        let payload: &[u8] = &[0u8, 1, 2, 0, 0xff, 0xfe, b'a'];
1107        let v = p.push(payload).unwrap();
1108        assert_eq!(p.get(v), payload);
1109    }
1110
1111    #[test]
1112    fn pool_get_by_index_matches_views() {
1113        let mut p = ByteStringPool::new();
1114        p.push(b"a").unwrap();
1115        p.push(b"bb").unwrap();
1116        p.push(b"ccc").unwrap();
1117        assert_eq!(p.get_by_index(0).unwrap(), b"a");
1118        assert_eq!(p.get_by_index(1).unwrap(), b"bb");
1119        assert_eq!(p.get_by_index(2).unwrap(), b"ccc");
1120        assert!(p.get_by_index(3).is_none());
1121    }
1122
1123    #[test]
1124    fn pool_iter_is_insertion_order() {
1125        let mut p = ByteStringPool::new();
1126        p.push(b"z").unwrap();
1127        p.push(b"a").unwrap();
1128        p.push(b"m").unwrap();
1129        let collected: Vec<&[u8]> = p.iter().map(|(_, b)| b).collect();
1130        assert_eq!(collected, vec![b"z" as &[u8], b"a", b"m"]);
1131    }
1132
1133    // ── AdaptiveCodes promotion ─────────────────────────────────────────
1134
1135    #[test]
1136    fn adaptive_codes_starts_u8() {
1137        let c = AdaptiveCodes::new();
1138        assert!(matches!(c, AdaptiveCodes::U8(_)));
1139        assert_eq!(c.width_bytes(), 1);
1140    }
1141
1142    #[test]
1143    fn adaptive_codes_stays_u8_at_255() {
1144        let mut c = AdaptiveCodes::new();
1145        for i in 0u64..=255 {
1146            c.push(i);
1147        }
1148        assert!(matches!(c, AdaptiveCodes::U8(_)));
1149        assert_eq!(c.len(), 256);
1150        // Verify all values round-trip.
1151        for i in 0u64..=255 {
1152            assert_eq!(c.get(i as usize), i);
1153        }
1154    }
1155
1156    #[test]
1157    fn adaptive_codes_promotes_u8_to_u16_at_256() {
1158        let mut c = AdaptiveCodes::new();
1159        for i in 0u64..=255 {
1160            c.push(i);
1161        }
1162        assert!(matches!(c, AdaptiveCodes::U8(_)));
1163        c.push(256);
1164        assert!(matches!(c, AdaptiveCodes::U16(_)));
1165        // Existing codes preserved bit-for-bit.
1166        for i in 0u64..=255 {
1167            assert_eq!(c.get(i as usize), i);
1168        }
1169        assert_eq!(c.get(256), 256);
1170    }
1171
1172    #[test]
1173    fn adaptive_codes_promotes_u16_to_u32_at_65536() {
1174        let mut c = AdaptiveCodes::U16(vec![0u16, 1, 2, 65_535]);
1175        c.push(65_536);
1176        assert!(matches!(c, AdaptiveCodes::U32(_)));
1177        assert_eq!(c.get(0), 0);
1178        assert_eq!(c.get(3), 65_535);
1179        assert_eq!(c.get(4), 65_536);
1180    }
1181
1182    #[test]
1183    fn adaptive_codes_promotes_u32_to_u64_above_4b() {
1184        let mut c = AdaptiveCodes::U32(vec![0u32, 1, u32::MAX]);
1185        let huge = (u32::MAX as u64) + 1;
1186        c.push(huge);
1187        assert!(matches!(c, AdaptiveCodes::U64(_)));
1188        assert_eq!(c.get(0), 0);
1189        assert_eq!(c.get(2), u32::MAX as u64);
1190        assert_eq!(c.get(3), huge);
1191    }
1192
1193    #[test]
1194    fn adaptive_codes_iter_matches_get() {
1195        let mut c = AdaptiveCodes::new();
1196        for i in 0u64..10 {
1197            c.push(i * 7);
1198        }
1199        let via_iter: Vec<u64> = c.iter().collect();
1200        let via_get: Vec<u64> = (0..c.len()).map(|i| c.get(i)).collect();
1201        assert_eq!(via_iter, via_get);
1202    }
1203
1204    // ── ByteDictionary basic ────────────────────────────────────────────
1205
1206    #[test]
1207    fn dict_intern_assigns_sequential_codes() {
1208        let mut d = ByteDictionary::new();
1209        assert_eq!(d.intern(b"red").unwrap(), 0);
1210        assert_eq!(d.intern(b"green").unwrap(), 1);
1211        assert_eq!(d.intern(b"blue").unwrap(), 2);
1212        // Re-intern returns existing code.
1213        assert_eq!(d.intern(b"red").unwrap(), 0);
1214        assert_eq!(d.intern(b"green").unwrap(), 1);
1215        assert_eq!(d.len(), 3);
1216    }
1217
1218    #[test]
1219    fn dict_get_round_trips_bytes() {
1220        let mut d = ByteDictionary::new();
1221        d.intern(b"red").unwrap();
1222        d.intern(b"green").unwrap();
1223        d.intern(b"blue").unwrap();
1224        assert_eq!(d.get(0).unwrap(), b"red");
1225        assert_eq!(d.get(1).unwrap(), b"green");
1226        assert_eq!(d.get(2).unwrap(), b"blue");
1227        assert!(d.get(99).is_none());
1228    }
1229
1230    #[test]
1231    fn dict_lookup_does_not_extend() {
1232        let mut d = ByteDictionary::new();
1233        d.intern(b"red").unwrap();
1234        assert_eq!(d.lookup(b"red"), Some(0));
1235        assert_eq!(d.lookup(b"green"), None);
1236        assert_eq!(d.len(), 1);
1237    }
1238
1239    #[test]
1240    fn dict_first_seen_is_insertion_order() {
1241        // Inserting B, A, C must give codes 0, 1, 2 in that order.
1242        let mut d = ByteDictionary::new();
1243        assert_eq!(d.intern(b"B").unwrap(), 0);
1244        assert_eq!(d.intern(b"A").unwrap(), 1);
1245        assert_eq!(d.intern(b"C").unwrap(), 2);
1246        assert_eq!(d.get(0).unwrap(), b"B");
1247        assert_eq!(d.get(1).unwrap(), b"A");
1248        assert_eq!(d.get(2).unwrap(), b"C");
1249    }
1250
1251    #[test]
1252    fn dict_lexical_seal_reorders_by_bytes() {
1253        let mut d = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1254        d.intern(b"banana").unwrap();
1255        d.intern(b"apple").unwrap();
1256        d.intern(b"cherry").unwrap();
1257        let perm = d.seal_lexical();
1258        // After seal, codes are in byte-lex order.
1259        assert_eq!(d.get(0).unwrap(), b"apple");
1260        assert_eq!(d.get(1).unwrap(), b"banana");
1261        assert_eq!(d.get(2).unwrap(), b"cherry");
1262        // Permutation maps old → new.
1263        // banana was 0 → now 1
1264        // apple  was 1 → now 0
1265        // cherry was 2 → now 2
1266        assert_eq!(perm, vec![1, 0, 2]);
1267    }
1268
1269    #[test]
1270    fn dict_lexical_two_insertion_orders_seal_to_same_codes() {
1271        let mut d1 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1272        let mut d2 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1273        d1.intern(b"banana").unwrap();
1274        d1.intern(b"apple").unwrap();
1275        d1.intern(b"cherry").unwrap();
1276        d2.intern(b"cherry").unwrap();
1277        d2.intern(b"banana").unwrap();
1278        d2.intern(b"apple").unwrap();
1279        d1.seal_lexical();
1280        d2.seal_lexical();
1281        for code in 0u64..3 {
1282            assert_eq!(d1.get(code), d2.get(code), "lex ordering diverges at {code}");
1283        }
1284    }
1285
1286    #[test]
1287    fn dict_explicit_pre_populates_codes() {
1288        let d = ByteDictionary::from_explicit(vec![
1289            b"low".to_vec(),
1290            b"med".to_vec(),
1291            b"high".to_vec(),
1292        ])
1293        .unwrap();
1294        assert_eq!(d.lookup(b"low"), Some(0));
1295        assert_eq!(d.lookup(b"med"), Some(1));
1296        assert_eq!(d.lookup(b"high"), Some(2));
1297    }
1298
1299    #[test]
1300    fn dict_explicit_rejects_duplicates() {
1301        let err = ByteDictionary::from_explicit(vec![
1302            b"a".to_vec(),
1303            b"b".to_vec(),
1304            b"a".to_vec(),
1305        ]);
1306        assert!(matches!(err, Err(ByteDictError::ExplicitOrderingHasDuplicates)));
1307    }
1308
1309    #[test]
1310    fn dict_explicit_rejects_unknown_intern_when_frozen() {
1311        let mut d = ByteDictionary::from_explicit(vec![b"a".to_vec(), b"b".to_vec()]).unwrap();
1312        d.freeze();
1313        let res = d.intern(b"c");
1314        assert!(matches!(res, Err(ByteDictError::Frozen)));
1315    }
1316
1317    // ── Frozen + UnknownCategoryPolicy ──────────────────────────────────
1318
1319    #[test]
1320    fn dict_frozen_intern_errors() {
1321        let mut d = ByteDictionary::new();
1322        d.intern(b"a").unwrap();
1323        d.freeze();
1324        let err = d.intern(b"b");
1325        assert!(matches!(err, Err(ByteDictError::Frozen)));
1326    }
1327
1328    #[test]
1329    fn policy_error_returns_unknown_category() {
1330        let mut d = ByteDictionary::new();
1331        d.intern(b"a").unwrap();
1332        d.freeze();
1333        let res = d.intern_with_policy(b"b", &UnknownCategoryPolicy::Error);
1334        assert!(matches!(res, Err(ByteDictError::UnknownCategory)));
1335    }
1336
1337    #[test]
1338    fn policy_map_to_null_returns_null() {
1339        let mut d = ByteDictionary::new();
1340        d.intern(b"a").unwrap();
1341        d.freeze();
1342        let res = d.intern_with_policy(b"b", &UnknownCategoryPolicy::MapToNull);
1343        assert_eq!(res, Ok(InternedCode::Null));
1344    }
1345
1346    #[test]
1347    fn policy_map_to_other_returns_other_code() {
1348        let mut d = ByteDictionary::new();
1349        let _a = d.intern(b"a").unwrap();
1350        let other = d.intern(b"Other").unwrap();
1351        d.freeze();
1352        let res = d.intern_with_policy(
1353            b"unseen",
1354            &UnknownCategoryPolicy::MapToOther { other_code: other },
1355        );
1356        assert_eq!(res, Ok(InternedCode::Code(other)));
1357    }
1358
1359    #[test]
1360    fn policy_map_to_other_rejects_invalid_other_code() {
1361        let mut d = ByteDictionary::new();
1362        d.intern(b"a").unwrap();
1363        d.freeze();
1364        let res = d.intern_with_policy(
1365            b"x",
1366            &UnknownCategoryPolicy::MapToOther { other_code: 99 },
1367        );
1368        assert!(matches!(
1369            res,
1370            Err(ByteDictError::OtherCodeNotInDictionary { code: 99 })
1371        ));
1372    }
1373
1374    #[test]
1375    fn policy_extend_dictionary_bypasses_frozen() {
1376        let mut d = ByteDictionary::new();
1377        d.intern(b"a").unwrap();
1378        d.freeze();
1379        let res = d
1380            .intern_with_policy(b"b", &UnknownCategoryPolicy::ExtendDictionary)
1381            .unwrap();
1382        assert_eq!(res, InternedCode::Code(1));
1383        assert_eq!(d.len(), 2);
1384        // Dictionary stays frozen for subsequent strict calls.
1385        assert!(d.is_frozen());
1386        let strict = d.intern_with_policy(b"c", &UnknownCategoryPolicy::Error);
1387        assert!(matches!(strict, Err(ByteDictError::UnknownCategory)));
1388    }
1389
1390    // ── CategoricalColumn ───────────────────────────────────────────────
1391
1392    #[test]
1393    fn col_push_round_trip() {
1394        let mut c = CategoricalColumn::new();
1395        c.push(b"red").unwrap();
1396        c.push(b"blue").unwrap();
1397        c.push(b"red").unwrap();
1398        assert_eq!(c.len(), 3);
1399        assert_eq!(c.get(0).unwrap(), b"red");
1400        assert_eq!(c.get(1).unwrap(), b"blue");
1401        assert_eq!(c.get(2).unwrap(), b"red");
1402        // Two distinct categories.
1403        assert_eq!(c.dictionary().len(), 2);
1404    }
1405
1406    #[test]
1407    fn col_push_null_is_observed() {
1408        let mut c = CategoricalColumn::new();
1409        c.push(b"red").unwrap();
1410        c.push_null();
1411        c.push(b"blue").unwrap();
1412        assert_eq!(c.len(), 3);
1413        assert!(!c.is_null(0));
1414        assert!(c.is_null(1));
1415        assert!(!c.is_null(2));
1416        assert_eq!(c.get(0).unwrap(), b"red");
1417        assert!(c.get(1).is_none());
1418        assert_eq!(c.get(2).unwrap(), b"blue");
1419    }
1420
1421    #[test]
1422    fn col_seal_lexical_remaps_codes() {
1423        let mut c =
1424            CategoricalColumn::with_dictionary(ByteDictionary::with_ordering(CategoryOrdering::Lexical));
1425        c.push(b"banana").unwrap();
1426        c.push(b"apple").unwrap();
1427        c.push(b"banana").unwrap();
1428        c.push(b"cherry").unwrap();
1429        c.seal_lexical();
1430        // After seal, banana → 1, apple → 0, cherry → 2.
1431        assert_eq!(c.codes().get(0), 1, "row0 was banana → seal to 1");
1432        assert_eq!(c.codes().get(1), 0, "row1 was apple → seal to 0");
1433        assert_eq!(c.codes().get(2), 1, "row2 was banana → seal to 1");
1434        assert_eq!(c.codes().get(3), 2, "row3 was cherry → seal to 2");
1435        // Round-trip preserved.
1436        assert_eq!(c.get(0).unwrap(), b"banana");
1437        assert_eq!(c.get(1).unwrap(), b"apple");
1438        assert_eq!(c.get(2).unwrap(), b"banana");
1439        assert_eq!(c.get(3).unwrap(), b"cherry");
1440    }
1441
1442    #[test]
1443    fn col_promotion_at_256_distinct() {
1444        // Force the U8 → U16 transition by pushing 257 distinct values.
1445        let mut c = CategoricalColumn::new();
1446        for i in 0u32..257 {
1447            // Generate distinct byte strings.
1448            let s = format!("v{}", i);
1449            c.push(s.as_bytes()).unwrap();
1450        }
1451        assert_eq!(c.dictionary().len(), 257);
1452        // Code arm has been promoted to U16 (or wider).
1453        assert!(matches!(
1454            c.codes(),
1455            AdaptiveCodes::U16(_) | AdaptiveCodes::U32(_) | AdaptiveCodes::U64(_)
1456        ));
1457        // Every row round-trips.
1458        for i in 0u32..257 {
1459            let s = format!("v{}", i);
1460            assert_eq!(c.get(i as usize).unwrap(), s.as_bytes());
1461        }
1462    }
1463
1464    #[test]
1465    fn col_push_with_policy_null_records_null() {
1466        let mut c = CategoricalColumn::new();
1467        c.push(b"a").unwrap();
1468        c.push(b"b").unwrap();
1469        // Freeze the dictionary, then push an unknown with MapToNull.
1470        // We need to freeze after seeding the column, so reach in via
1471        // a fresh setup: build with explicit dict.
1472        let mut dict = ByteDictionary::new();
1473        dict.intern(b"a").unwrap();
1474        dict.intern(b"b").unwrap();
1475        dict.freeze();
1476        let mut c2 = CategoricalColumn::with_dictionary(dict);
1477        c2.push_with_policy(b"a", &UnknownCategoryPolicy::Error).unwrap();
1478        c2.push_with_policy(b"unseen", &UnknownCategoryPolicy::MapToNull)
1479            .unwrap();
1480        c2.push_with_policy(b"b", &UnknownCategoryPolicy::Error).unwrap();
1481        assert_eq!(c2.len(), 3);
1482        assert!(!c2.is_null(0));
1483        assert!(c2.is_null(1));
1484        assert!(!c2.is_null(2));
1485    }
1486
1487    // ── Profile ─────────────────────────────────────────────────────────
1488
1489    #[test]
1490    fn profile_reports_basic_stats() {
1491        let mut c = CategoricalColumn::new();
1492        c.push(b"red").unwrap();
1493        c.push(b"blue").unwrap();
1494        c.push_null();
1495        c.push(b"red").unwrap();
1496        let p = c.profile();
1497        assert_eq!(p.nrows, 4);
1498        assert_eq!(p.cardinality, 2);
1499        assert_eq!(p.missing, 1);
1500        assert_eq!(p.bytes_used, 7); // "red" + "blue"
1501        // unique_ratio: 2/4 = 0.5 = 500 thousandths
1502        assert_eq!(p.unique_ratio_thousandths, 500);
1503        assert_eq!(p.code_width_bytes, 1);
1504    }
1505
1506    // ── Determinism property ────────────────────────────────────────────
1507
1508    #[test]
1509    fn determinism_same_input_first_seen_two_dicts() {
1510        let inputs: &[&[u8]] = &[b"alpha", b"beta", b"alpha", b"gamma", b"beta", b"delta"];
1511        let mut d1 = ByteDictionary::new();
1512        let mut d2 = ByteDictionary::new();
1513        let codes1: Vec<u64> = inputs.iter().map(|b| d1.intern(b).unwrap()).collect();
1514        let codes2: Vec<u64> = inputs.iter().map(|b| d2.intern(b).unwrap()).collect();
1515        assert_eq!(codes1, codes2, "FirstSeen must be deterministic");
1516        // Sanity: alpha=0, beta=1, gamma=2, delta=3.
1517        assert_eq!(codes1, vec![0, 1, 0, 2, 1, 3]);
1518    }
1519
1520    #[test]
1521    fn determinism_lexical_two_permutations_seal_identical() {
1522        let perm1: &[&[u8]] = &[b"alpha", b"beta", b"gamma", b"delta"];
1523        let perm2: &[&[u8]] = &[b"delta", b"gamma", b"beta", b"alpha"];
1524
1525        let mut d1 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1526        let mut d2 = ByteDictionary::with_ordering(CategoryOrdering::Lexical);
1527        for s in perm1 {
1528            d1.intern(s).unwrap();
1529        }
1530        for s in perm2 {
1531            d2.intern(s).unwrap();
1532        }
1533        d1.seal_lexical();
1534        d2.seal_lexical();
1535        // After sealing, both dictionaries must have identical code → bytes
1536        // mapping for every category.
1537        for code in 0u64..4 {
1538            assert_eq!(d1.get(code), d2.get(code), "code {code}");
1539        }
1540    }
1541}