bloom_lib/hash.rs
1//! Hashing infrastructure shared by every structure.
2//!
3//! Probabilistic data structures live and die by the quality of their hash
4//! function: poor mixing inflates the false-positive rate and skews cardinality
5//! estimates. This module supplies a fast, high-quality, **deterministic**
6//! default hasher and the machinery to derive the multiple index hashes each
7//! structure needs from a single pass over the input.
8//!
9//! # Choosing a hasher
10//!
11//! Every structure is generic over [`core::hash::BuildHasher`] and defaults to
12//! [`DefaultHashBuilder`]. The default is seeded with a fixed constant, which
13//! makes hashing reproducible: two filters built with the default hasher are
14//! always mergeable, and serialized state round-trips byte-for-byte across
15//! runs. The trade-off is that a fixed seed offers no defence against an
16//! adversary who crafts inputs to collide on purpose. When the inputs are
17//! untrusted, substitute a randomly-seeded [`BuildHasher`] such as
18//! [`std::collections::hash_map::RandomState`]:
19//!
20//! ```
21//! # #[cfg(feature = "std")] {
22//! use std::collections::hash_map::RandomState;
23//! use bloom_lib::BloomFilter;
24//!
25//! let filter: BloomFilter<&str, RandomState> =
26//! BloomFilter::with_hasher(1_000, 0.01, RandomState::new()).unwrap();
27//! # }
28//! ```
29
30use core::hash::{BuildHasher, Hasher};
31
32/// Fixed seed for [`DefaultHasher`]. Chosen for a good bit distribution; the
33/// value itself is arbitrary but must stay constant for reproducibility.
34const SEED: u64 = 0x2545_F491_4F6C_DD1D;
35
36/// Odd multiplier used by the block mixer. Large, odd, and high-entropy.
37const MIX_A: u64 = 0xFF51_AFD7_ED55_8CCD;
38
39/// Second multiplier, applied in the finalizer to decorrelate the result.
40const MIX_B: u64 = 0xC4CE_B9FE_1A85_EC53;
41
42/// 64x64 -> 64 multiply-fold, the core mixing step (the "wymum" primitive).
43///
44/// Multiplying two 64-bit words yields a 128-bit product whose high and low
45/// halves are then folded together with XOR. This propagates every input bit
46/// across the whole output and is the engine behind several modern hashes.
47#[inline(always)]
48fn fold_mul(a: u64, b: u64) -> u64 {
49 let product = (a as u128).wrapping_mul(b as u128);
50 (product as u64) ^ ((product >> 64) as u64)
51}
52
53/// A fast, non-cryptographic 64-bit hasher with strong avalanche behaviour.
54///
55/// `DefaultHasher` folds the input eight bytes at a time through a 64x64-bit
56/// multiply-fold and mixes the total length into the finalizer, so inputs that
57/// differ only by trailing zero bytes do not collide. It is **not** suitable for
58/// authentication or any setting where collision resistance against an
59/// adversary is required; use a cryptographic hash for that.
60///
61/// You rarely construct this directly. It is produced by
62/// [`DefaultHashBuilder`], which the structures use by default.
63///
64/// # Examples
65///
66/// ```
67/// use core::hash::{Hash, Hasher};
68/// use bloom_lib::hash::DefaultHasher;
69///
70/// let mut hasher = DefaultHasher::default();
71/// "probabilistic".hash(&mut hasher);
72/// let digest = hasher.finish();
73///
74/// // Hashing the same value again yields the same digest.
75/// let mut again = DefaultHasher::default();
76/// "probabilistic".hash(&mut again);
77/// assert_eq!(digest, again.finish());
78/// ```
79#[derive(Debug, Clone)]
80pub struct DefaultHasher {
81 state: u64,
82 len: u64,
83}
84
85impl DefaultHasher {
86 /// Creates a hasher seeded with the given value.
87 ///
88 /// Two hashers created with the same seed produce identical digests for
89 /// identical inputs. [`DefaultHasher::default`] uses a fixed library seed.
90 ///
91 /// # Examples
92 ///
93 /// ```
94 /// use core::hash::Hasher;
95 /// use bloom_lib::hash::DefaultHasher;
96 ///
97 /// let mut a = DefaultHasher::with_seed(42);
98 /// let mut b = DefaultHasher::with_seed(7);
99 /// a.write(b"payload");
100 /// b.write(b"payload");
101 /// assert_ne!(a.finish(), b.finish());
102 /// ```
103 #[inline]
104 #[must_use]
105 pub const fn with_seed(seed: u64) -> Self {
106 Self {
107 state: seed,
108 len: 0,
109 }
110 }
111}
112
113impl Default for DefaultHasher {
114 #[inline]
115 fn default() -> Self {
116 Self::with_seed(SEED)
117 }
118}
119
120impl Hasher for DefaultHasher {
121 #[inline]
122 fn write(&mut self, bytes: &[u8]) {
123 self.len = self.len.wrapping_add(bytes.len() as u64);
124
125 let mut rest = bytes;
126 while rest.len() >= 8 {
127 let mut block = [0u8; 8];
128 block.copy_from_slice(&rest[..8]);
129 self.state = fold_mul(self.state ^ u64::from_le_bytes(block), MIX_A);
130 rest = &rest[8..];
131 }
132
133 if !rest.is_empty() {
134 let mut block = [0u8; 8];
135 block[..rest.len()].copy_from_slice(rest);
136 self.state = fold_mul(self.state ^ u64::from_le_bytes(block), MIX_A);
137 }
138 }
139
140 #[inline]
141 fn finish(&self) -> u64 {
142 fold_mul(self.state ^ self.len, MIX_B)
143 }
144}
145
146/// A [`BuildHasher`] that yields [`DefaultHasher`] instances seeded with a fixed
147/// constant.
148///
149/// This is the default hasher for every structure in the crate. Because the
150/// seed is fixed, the resulting hashing is deterministic: ideal for
151/// reproducible builds, mergeable filters, and stable serialized state. It is a
152/// zero-sized type, so embedding it in a structure costs nothing.
153///
154/// # Examples
155///
156/// ```
157/// use core::hash::BuildHasher;
158/// use bloom_lib::hash::DefaultHashBuilder;
159///
160/// let builder = DefaultHashBuilder::default();
161/// let h1 = builder.hash_one("key");
162/// let h2 = builder.hash_one("key");
163/// assert_eq!(h1, h2);
164/// ```
165#[derive(Debug, Clone, Copy, Default)]
166#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
167pub struct DefaultHashBuilder;
168
169impl BuildHasher for DefaultHashBuilder {
170 type Hasher = DefaultHasher;
171
172 #[inline]
173 fn build_hasher(&self) -> Self::Hasher {
174 DefaultHasher::default()
175 }
176}
177
178/// A pair of independent 64-bit hashes used to synthesise an arbitrary number
179/// of index hashes via the Kirsch-Mitzenmacher double-hashing scheme.
180///
181/// The structures need `k` distinct hash positions per item but hash the item
182/// only once. From the base digest `h1` and a decorrelated companion `h2`, the
183/// `i`-th position is `h1 + i * h2`. This yields `k` well-distributed hashes at
184/// the cost of a single hashing pass.
185#[cfg(feature = "alloc")]
186#[derive(Debug, Clone, Copy)]
187pub(crate) struct HashPair {
188 h1: u64,
189 h2: u64,
190}
191
192#[cfg(feature = "alloc")]
193impl HashPair {
194 /// Hashes `item` once with `build_hasher` and derives the companion hash.
195 #[inline]
196 pub(crate) fn new<T, S>(item: &T, build_hasher: &S) -> Self
197 where
198 T: core::hash::Hash + ?Sized,
199 S: BuildHasher,
200 {
201 let h1 = build_hasher.hash_one(item);
202 // Derive a second, decorrelated hash from the first with a strong mix.
203 // Forcing it odd guarantees the `h1 + i * h2` sequence visits a long
204 // cycle of residues regardless of the modulus.
205 let h2 = fold_mul(h1, MIX_B) | 1;
206 Self { h1, h2 }
207 }
208
209 /// The `i`-th synthesised 64-bit hash.
210 #[inline(always)]
211 pub(crate) fn nth(&self, i: u64) -> u64 {
212 self.h1.wrapping_add(i.wrapping_mul(self.h2))
213 }
214}
215
216/// Maps a 64-bit hash uniformly into `[0, range)` without a division.
217///
218/// This is Lemire's multiply-shift reduction: it takes the high 64 bits of
219/// `hash * range`, which is both faster than `hash % range` and free of the
220/// modulo bias that `%` introduces when `range` is not a power of two.
221#[cfg(feature = "alloc")]
222#[inline(always)]
223pub(crate) fn reduce(hash: u64, range: u64) -> u64 {
224 (((hash as u128).wrapping_mul(range as u128)) >> 64) as u64
225}
226
227/// A strong 64-bit avalanche mix, used to derive a secondary hash from a small
228/// value (such as a Cuckoo fingerprint) where the input has little entropy.
229#[cfg(feature = "alloc")]
230#[inline(always)]
231pub(crate) fn mix64(x: u64) -> u64 {
232 fold_mul(x ^ MIX_B, MIX_A)
233}