Skip to main content

commonware_storage/
translator.rs

1//! Primitive implementations of [Translator].
2
3use commonware_utils::GOLDEN_RATIO;
4use core::hash::{BuildHasher, Hash, Hasher};
5
6/// Translate keys into a new representation (often a smaller one).
7///
8/// # Warning
9///
10/// The output of [Translator::transform] is often used as a key in a hash table. If the output is
11/// not uniformly distributed, the performance of said hash table will degrade substantially.
12pub trait Translator: Clone + BuildHasher + Send + Sync + 'static {
13    /// The type of the internal representation of keys.
14    ///
15    /// Although [Translator] is a [BuildHasher], the `Key` type must still implement [Hash] for
16    /// compatibility with any hash table that wraps [Translator]. We also require [Ord] for
17    /// compatibility with ordered collections. [Send] and [Sync] are required for thread-safe
18    /// concurrent access.
19    type Key: Ord + Hash + Copy + Send + Sync;
20
21    /// Transform a key into its new representation.
22    fn transform(&self, key: &[u8]) -> Self::Key;
23}
24
25/// A lightweight hasher for translated `uint` keys.
26///
27/// Most users typically store keys that are **already hashed** (shortened by the [Translator]).
28/// Re-hashing them with SipHash (by [std::collections::HashMap]) would waste CPU, so we give
29/// [std::collections::HashMap] this custom hasher instead:
30///
31/// * [Hasher::write_u8], [Hasher::write_u16], [Hasher::write_u32], [Hasher::write_u64] copies the
32///   input into an internal field;
33/// * [Hasher::finish] returns that value multiplied by a mixing constant.
34///
35/// # Warning
36///
37/// This hasher is not suitable for general use. If the hasher is called on a byte slice longer
38/// than `size_of::<u64>()`, it will panic.
39#[derive(Default, Clone)]
40pub struct UintIdentity {
41    value: u64,
42}
43
44impl Hasher for UintIdentity {
45    #[inline]
46    fn write(&mut self, bytes: &[u8]) {
47        assert!(bytes.len() <= 8, "UintIdenty hasher cannot handle >8 bytes");
48        // Treat the input array as a little-endian int to ensure low-order bits don't end up mostly
49        // 0s, given that we right-pad.
50        self.value = u64::from_le_bytes(cap::<8>(bytes));
51    }
52
53    #[inline]
54    fn write_u8(&mut self, i: u8) {
55        self.value = i as u64;
56    }
57
58    #[inline]
59    fn write_u16(&mut self, i: u16) {
60        self.value = i as u64;
61    }
62
63    #[inline]
64    fn write_u32(&mut self, i: u32) {
65        self.value = i as u64;
66    }
67
68    #[inline]
69    fn write_u64(&mut self, i: u64) {
70        self.value = i;
71    }
72
73    #[inline]
74    fn finish(&self) -> u64 {
75        // Multiply by the mixing constant to spread low-order bits across all 64 bits.
76        // Without this, hashbrown's h2 control bytes (top 7 bits) are all zero for small
77        // keys, defeating its SIMD fast-reject filter.
78        self.value.wrapping_mul(GOLDEN_RATIO)
79    }
80}
81
82/// Cap the key to a fixed length.
83///
84/// # Behavior
85///
86/// - If input is shorter than `N`, the output is zero-padded.
87/// - If input is longer than `N`, the output is truncated.
88/// - If input is exactly `N`, the output is identical.
89fn cap<const N: usize>(key: &[u8]) -> [u8; N] {
90    let mut capped = [0; N];
91    let len = key.len().min(N);
92    capped[..len].copy_from_slice(&key[..len]);
93    capped
94}
95
96macro_rules! define_cap_translator {
97    ($name:ident, $size:expr, $int:ty) => {
98        #[doc = concat!("Translator that caps the key to ", stringify!($size), " byte(s) and returns it packed in a ", stringify!($int), ".")]
99        #[derive(Clone, Default)]
100        pub struct $name;
101
102        impl Translator for $name {
103            // Minimal uint size for the key.
104            type Key = $int;
105
106            #[inline]
107            fn transform(&self, key: &[u8]) -> Self::Key {
108                let capped = cap::<$size>(key);
109                <$int>::from_be_bytes(capped)
110            }
111        }
112
113        // Implement the `BuildHasher` trait for `IdentityHasher`.
114        impl BuildHasher for $name {
115            type Hasher = UintIdentity;
116
117            #[inline]
118            fn build_hasher(&self) -> Self::Hasher {
119                UintIdentity::default()
120            }
121        }
122    };
123}
124
125// Define order-preserving translators for different sizes.
126define_cap_translator!(OneCap, 1, u8);
127define_cap_translator!(TwoCap, 2, u16);
128define_cap_translator!(FourCap, 4, u32);
129define_cap_translator!(EightCap, 8, u64);
130
131/// Define a special array type for which we'll implement our own lightweight hasher. This avoids the
132/// overhead of the default Array hasher which unnecessarily (for our use case) includes a length
133/// prefix.
134#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
135pub struct UnhashedArray<const N: usize> {
136    pub inner: [u8; N],
137}
138
139impl<const N: usize> Hash for UnhashedArray<N> {
140    #[inline]
141    fn hash<H: Hasher>(&self, state: &mut H) {
142        state.write(&self.inner);
143    }
144}
145
146impl<const N: usize> PartialEq<[u8; N]> for UnhashedArray<N> {
147    fn eq(&self, other: &[u8; N]) -> bool {
148        &self.inner == other
149    }
150}
151
152/// Translators for keys that are not the length of a standard integer.
153#[derive(Clone, Copy)]
154pub struct Cap<const N: usize>;
155
156impl<const N: usize> Cap<N> {
157    pub const fn new() -> Self {
158        const {
159            assert!(N <= 8 && N > 0, "Cap must be between 1 and 8");
160        };
161        Self
162    }
163}
164
165// Manually implement Default for Cap<N> so it calls new() which validates N.
166impl<const N: usize> Default for Cap<N> {
167    fn default() -> Self {
168        Self::new()
169    }
170}
171
172impl<const N: usize> Translator for Cap<N> {
173    type Key = UnhashedArray<N>;
174
175    #[inline]
176    fn transform(&self, key: &[u8]) -> Self::Key {
177        const {
178            assert!(N <= 8 && N > 0, "Cap must be between 1 and 8");
179        };
180        UnhashedArray {
181            inner: cap::<N>(key),
182        }
183    }
184}
185
186impl<const N: usize> BuildHasher for Cap<N> {
187    type Hasher = UintIdentity;
188
189    #[inline]
190    fn build_hasher(&self) -> Self::Hasher {
191        UintIdentity::default()
192    }
193}
194
195/// Collision-resistant wrapper for any [Translator].
196///
197/// Hashes the full key with a per-instance secret seed before delegating to the inner translator.
198/// This makes translated-key collisions unpredictable to an adversary who
199/// does not know the seed, similar to how [std::collections::HashMap] uses
200/// [std::collections::hash_map::RandomState] to prevent HashDoS attacks. It can also be used to
201/// ensure uniform distribution of skewed keyspaces when used by non-hashing structures such as
202/// [crate::index].
203///
204/// # No Ordering
205///
206/// Hashing destroys lexicographic key ordering. Do not use [Hashed] with ordered indices when
207/// callers rely on translated-key adjacency matching original-key adjacency (e.g., exclusion proofs
208/// in the ordered QMDB). [Hashed] is safe for unordered indices and partitioned unordered indices.
209///
210/// # `no_std` Support
211///
212/// [Hashed::new] and [Default] are available only with this crate's `std` feature. They use
213/// [ahash::RandomState::new], which requires OS-provided randomness (the `runtime-rng` feature of
214/// `ahash`, enabled by `std` here). In `no_std` environments, use [Hashed::from_seed] with an
215/// externally-sourced random seed instead.
216///
217/// # No Stability Guarantees
218///
219/// [ahash::RandomState] is used as the underlying hasher. While `ahash` is robust, its exact
220/// algorithm might change across versions. As a result, transformed outputs are not stable across
221/// Rust versions or platforms. Treat this translator as an in-memory collision-hardening mechanism,
222/// not as a stable/persisted encoding.
223///
224/// # Examples
225///
226/// ```
227/// use commonware_storage::translator::{Hashed, TwoCap, Translator};
228///
229/// # #[cfg(feature = "std")]
230/// # {
231/// // Random seed (production use, requires `std`):
232/// let t = Hashed::new(TwoCap);
233/// let _k = t.transform(b"hello");
234/// # }
235///
236/// // Deterministic seed (testing within the same toolchain/runtime):
237/// let t = Hashed::from_seed(42, TwoCap);
238/// assert_eq!(t.transform(b"hello"), t.transform(b"hello"));
239/// ```
240#[derive(Clone)]
241pub struct Hashed<T: Translator> {
242    random_state: ahash::RandomState,
243    inner: T,
244}
245
246#[cfg(feature = "std")]
247impl<T: Translator + Default> Default for Hashed<T> {
248    fn default() -> Self {
249        Self::new(T::default())
250    }
251}
252
253impl<T: Translator> Hashed<T> {
254    /// Create a new [Hashed] translator with a random seed.
255    #[cfg(feature = "std")]
256    pub fn new(inner: T) -> Self {
257        Self {
258            random_state: ahash::RandomState::new(),
259            inner,
260        }
261    }
262
263    /// Create a new [Hashed] translator with a specific seed.
264    pub const fn from_seed(seed: u64, inner: T) -> Self {
265        Self {
266            random_state: ahash::RandomState::with_seeds(seed, 0, 0, 0),
267            inner,
268        }
269    }
270}
271
272impl<T: Translator> Translator for Hashed<T> {
273    type Key = T::Key;
274
275    #[inline]
276    fn transform(&self, key: &[u8]) -> T::Key {
277        let hash_val = self.random_state.hash_one(key);
278        self.inner.transform(&hash_val.to_le_bytes())
279    }
280}
281
282impl<T: Translator> BuildHasher for Hashed<T> {
283    type Hasher = <T as BuildHasher>::Hasher;
284
285    #[inline]
286    fn build_hasher(&self) -> Self::Hasher {
287        self.inner.build_hasher()
288    }
289}
290
291#[cfg(test)]
292mod tests {
293    use super::*;
294    use core::hash::Hasher;
295
296    #[test]
297    fn test_one_cap() {
298        let t = OneCap;
299        assert_eq!(t.transform(b""), 0);
300        assert_eq!(t.transform(b"a"), b'a');
301        assert_eq!(t.transform(b"ab"), b'a');
302        assert_eq!(t.transform(b"abc"), b'a');
303    }
304
305    #[test]
306    fn test_two_cap() {
307        let t = TwoCap;
308        assert_eq!(t.transform(b""), 0);
309        assert_eq!(t.transform(b"abc"), t.transform(b"ab"));
310        assert!(t.transform(b"") < t.transform(b"a"));
311        assert!(t.transform(b"a") < t.transform(b"b"));
312        assert!(t.transform(b"ab") < t.transform(b"ac"));
313        assert!(t.transform(b"z") < t.transform(b"zz"));
314        assert_eq!(t.transform(b"zz"), t.transform(b"zzabc"));
315    }
316
317    #[test]
318    fn test_four_cap() {
319        let t = FourCap;
320        let t1 = t.transform(b"");
321        let t2 = t.transform(b"a");
322        let t3 = t.transform(b"abcd");
323        let t4 = t.transform(b"abcdef");
324        let t5 = t.transform(b"b");
325
326        assert_eq!(t1, 0);
327        assert!(t1 < t2);
328        assert!(t2 < t3);
329        assert_eq!(t3, t4);
330        assert!(t3 < t5);
331        assert!(t4 < t5);
332    }
333
334    #[test]
335    fn test_cap_3() {
336        let t = Cap::<3>::new();
337        assert_eq!(t.transform(b""), [0; 3]);
338        assert_eq!(t.transform(b"abc"), *b"abc");
339        assert_eq!(t.transform(b"abcdef"), *b"abc");
340        assert_eq!(t.transform(b"ab"), [b'a', b'b', 0]);
341    }
342
343    #[test]
344    fn test_cap_6() {
345        let t = Cap::<6>::new();
346        assert_eq!(t.transform(b""), [0; 6]);
347        assert_eq!(t.transform(b"abcdef"), *b"abcdef");
348        assert_eq!(t.transform(b"abcdefghi"), *b"abcdef");
349        assert_eq!(t.transform(b"abc"), [b'a', b'b', b'c', 0, 0, 0]);
350    }
351
352    #[test]
353    fn test_eight_cap() {
354        let t = EightCap;
355        let t1 = t.transform(b"");
356        let t2 = t.transform(b"a");
357        let t3 = t.transform(b"abcdefghaaaaaaa");
358        let t4 = t.transform(b"abcdefghijkzzzzzzzzzzzzzzzzzz");
359        let t5 = t.transform(b"b");
360
361        assert_eq!(t1, 0);
362        assert!(t1 < t2);
363        assert!(t2 < t3);
364        assert_eq!(t3, t4);
365        assert!(t3 < t5);
366        assert!(t4 < t5);
367    }
368
369    #[test]
370    fn identity_hasher_small_slices_differ() {
371        let hash = |bytes: &[u8]| {
372            let mut h = UintIdentity::default();
373            h.write(bytes);
374            h.finish()
375        };
376        assert_ne!(hash(b"abc"), hash(b"abd"));
377        assert_ne!(hash(b"a"), hash(b"b"));
378        assert_ne!(hash(b""), hash(b"a"));
379    }
380
381    #[test]
382    fn identity_hasher_sets_high_bits() {
383        // The mixing step must spread small values into the top 7 bits so that
384        // hashbrown's h2 SIMD filter is effective.
385        for i in [1u64, 7, 17, 255] {
386            let mut h = UintIdentity::default();
387            h.write_u64(i);
388            assert_ne!(h.finish() >> 57, 0, "high bits all zero for input {i}");
389        }
390    }
391
392    #[test]
393    fn identity_hasher_integer_writes_differ() {
394        let hash_u8 = |v: u8| {
395            let mut h = UintIdentity::default();
396            h.write_u8(v);
397            h.finish()
398        };
399        let hash_u16 = |v: u16| {
400            let mut h = UintIdentity::default();
401            h.write_u16(v);
402            h.finish()
403        };
404        let hash_u32 = |v: u32| {
405            let mut h = UintIdentity::default();
406            h.write_u32(v);
407            h.finish()
408        };
409        let hash_u64 = |v: u64| {
410            let mut h = UintIdentity::default();
411            h.write_u64(v);
412            h.finish()
413        };
414        assert_ne!(hash_u8(0), hash_u8(1));
415        assert_ne!(hash_u16(0), hash_u16(1));
416        assert_ne!(hash_u32(0), hash_u32(1));
417        assert_ne!(hash_u64(0), hash_u64(1));
418
419        // Same numeric value through different write widths must agree.
420        assert_eq!(hash_u8(7), hash_u16(7));
421        assert_eq!(hash_u16(7), hash_u32(7));
422        assert_eq!(hash_u32(7), hash_u64(7));
423    }
424
425    #[test]
426    #[should_panic]
427    fn identity_hasher_panics_on_large_write_slice() {
428        let mut h = UintIdentity::default();
429        h.write(b"too big for an int");
430    }
431
432    #[test]
433    fn test_hashed_consistency() {
434        let t = Hashed::from_seed(42, TwoCap);
435        assert_eq!(t.transform(b"hello"), t.transform(b"hello"));
436        assert_eq!(t.transform(b""), t.transform(b""));
437        assert_eq!(t.transform(b"abcdef"), t.transform(b"abcdef"));
438    }
439
440    #[test]
441    fn test_hashed_seed_determinism() {
442        let t1 = Hashed::from_seed(42, TwoCap);
443        let t2 = Hashed::from_seed(42, TwoCap);
444        assert_eq!(t1.transform(b"hello"), t2.transform(b"hello"));
445        assert_eq!(t1.transform(b"world"), t2.transform(b"world"));
446    }
447
448    #[test]
449    fn test_hashed_seed_independence() {
450        let t1 = Hashed::from_seed(1, EightCap);
451        let t2 = Hashed::from_seed(2, EightCap);
452        // Different seeds should (with overwhelming probability) produce different outputs.
453        assert_ne!(t1.transform(b"hello"), t2.transform(b"hello"));
454    }
455
456    #[test]
457    fn test_hashed_prefix_collisions_avoided() {
458        // Without hashing, keys sharing a prefix collide. With hashing, they should not.
459        let t = Hashed::from_seed(99, TwoCap);
460        let k1 = t.transform(b"abXXX");
461        let k2 = t.transform(b"abYYY");
462        // The unhashed TwoCap would map both to the same value (first 2 bytes "ab").
463        assert_ne!(k1, k2);
464    }
465
466    #[test]
467    fn test_hashed_all_cap_sizes() {
468        let t1 = Hashed::from_seed(7, OneCap);
469        assert_eq!(t1.transform(b"test"), t1.transform(b"test"));
470
471        let t4 = Hashed::from_seed(7, FourCap);
472        assert_eq!(t4.transform(b"test"), t4.transform(b"test"));
473
474        let t8 = Hashed::from_seed(7, EightCap);
475        assert_eq!(t8.transform(b"test"), t8.transform(b"test"));
476
477        let tc = Hashed::from_seed(7, Cap::<3>::new());
478        assert_eq!(tc.transform(b"test"), tc.transform(b"test"));
479    }
480
481    #[test]
482    fn test_hashed_random_seed() {
483        // Two instances with random seeds should (with overwhelming probability)
484        // produce different outputs.
485        let t1 = Hashed::new(EightCap);
486        let t2 = Hashed::new(EightCap);
487        assert_ne!(t1.transform(b"hello"), t2.transform(b"hello"));
488    }
489}