cjc_data/detcoll/mod.rs
1//! Deterministic collection family — Phase 7.
2//!
3//! A small set of collections each tuned to one workload shape, sharing a
4//! determinism contract:
5//!
6//! 1. **No randomized hashing.** All hash functions are fixed
7//! (`splitmix64` for `DHarht` and `DetOpenMap`).
8//! 2. **No pointer-address ordering.** Iteration order is driven by
9//! keys, sort order, or insertion order — never raw addresses.
10//! 3. **Bounded behavior.** Every structure has a documented worst
11//! case; failure mode is deterministic fallback (typically
12//! `BTreeMap`), never silent corruption.
13//! 4. **Full key equality on success.** Hash collision never returns
14//! a wrong value.
15//!
16//! # When to use which
17//!
18//! | Workload | Pick |
19//! |-------------------------------------------|----------------|
20//! | Tiny maps (≤ ~16 entries) | `TinyDetMap` |
21//! | Small sealed sorted maps | `SortedVecMap` |
22//! | Dense `IdType -> Value` ID tables | `IndexVec` |
23//! | Sparse mutable equality lookup | `DetOpenMap` |
24//! | Large sealed equality lookup, prefix-heavy| `DHarht` |
25//! | Range / prefix queries / canonical output | `BTreeMap` |
26//!
27//! `DHarht` is **not** a global `BTreeMap` replacement. It is best for
28//! byte-addressable, sealed/read-heavy, deterministic equality lookup
29//! workloads. `BTreeMap` remains the choice for canonical ordering,
30//! diagnostics, serialization, and range behavior.
31
32pub mod dharht;
33pub mod dharht_memory;
34pub mod det_open;
35pub mod index_vec;
36pub mod sealed_u64_map;
37pub mod sorted_vec_map;
38pub mod tiny_det_map;
39
40pub use dharht::{DHarht, LookupProfile, MicroBucket};
41pub use dharht_memory::DHarhtMemory;
42pub use det_open::DetOpenMap;
43pub use index_vec::{Idx, IndexVec};
44pub use sealed_u64_map::SealedU64Map;
45pub use sorted_vec_map::SortedVecMap;
46pub use tiny_det_map::TinyDetMap;
47
48/// Splitmix64 mixer used as the deterministic hash function across
49/// `DHarht` and `DetOpenMap`. Pure function: same input → same output
50/// on every machine, every architecture, every locale.
51///
52/// This is the same mixing function `cjc-repro` uses for its RNG —
53/// reusing it here keeps the workspace's "one deterministic mixer" rule.
54#[inline(always)]
55pub(crate) fn splitmix64(mut x: u64) -> u64 {
56 x = x.wrapping_add(0x9E3779B97F4A7C15);
57 x = (x ^ (x >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
58 x = (x ^ (x >> 27)).wrapping_mul(0x94D049BB133111EB);
59 x ^ (x >> 31)
60}
61
62/// Hash arbitrary bytes deterministically. Used as the keying function
63/// for `DHarht::insert_bytes` and (post-Phase-11) as the canonical
64/// 64-bit content hash for `ByteDictionary::seal_with_u64_hash_index`.
65///
66/// **Phase 9: switched from splitmix64+FNV folding to multiplicative
67/// hash on 8-byte chunks.** Same determinism contract (no platform-
68/// dependent seeds, byte-equal across runs / machines), but ~5× faster
69/// on short keys because we skip the inner splitmix64 mixer per chunk.
70/// Final mix at the end folds in length to maintain distribution
71/// quality.
72#[inline]
73pub fn hash_bytes(bytes: &[u8]) -> u64 {
74 // FxHash-style multiplicative constant (golden ratio in 64-bit).
75 // Fixed across runs — no random seed.
76 const K: u64 = 0x517cc1b727220a95;
77 let mut h: u64 = 0xCBF29CE484222325; // FNV offset basis as initial seed
78 let mut chunks = bytes.chunks_exact(8);
79 for c in &mut chunks {
80 let v = u64::from_le_bytes([c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7]]);
81 h = h.rotate_left(5).wrapping_mul(K) ^ v;
82 }
83 let rem = chunks.remainder();
84 if !rem.is_empty() {
85 let mut tail = [0u8; 8];
86 tail[..rem.len()].copy_from_slice(rem);
87 let v = u64::from_le_bytes(tail);
88 h = h.rotate_left(5).wrapping_mul(K) ^ v;
89 }
90 // Final avalanche — splitmix64 finalizer ensures shard / front
91 // indices are well-distributed. Cost: paid once per hash, not
92 // per chunk.
93 splitmix64(h ^ (bytes.len() as u64))
94}