Skip to main content

oxc_allocator/
ident_hasher.rs

1//! A custom hasher for `Ident` keys with precomputed hashes.
2//!
3//! [`IdentBuildHasher`] creates [`IdentHasher`]s that handle two hash paths:
4//! * `Ident::hash()` — writes a precomputed `u64` via [`write_u64`](std::hash::Hasher::write_u64)
5//! * `str::hash()` — writes bytes via [`write`](std::hash::Hasher::write), then computes hash data
6//!
7//! Both paths are normalized to the same final state so `Equivalent<Ident> for str` lookups
8//! always match `Ident`-keyed inserts.
9
10use std::hash::{BuildHasher, Hasher};
11
12/// Read 4 bytes from `s` at `offset` as a little-endian `u32`.
13#[inline]
14const fn read_u32(s: &[u8], i: usize) -> u32 {
15    (s[i] as u32) | ((s[i + 1] as u32) << 8) | ((s[i + 2] as u32) << 16) | ((s[i + 3] as u32) << 24)
16}
17
18/// Fold two `u32` values into one via 64-bit multiply + XOR-fold.
19///
20/// This is the core mixer from wyhash/rapidhash. The 64-bit multiply is non-linear,
21/// so every input bit influences many output bits. The XOR-fold combines the upper
22/// and lower halves for full avalanche.
23#[inline]
24#[expect(clippy::cast_possible_truncation)]
25const fn mix(a: u32, b: u32) -> u32 {
26    let m = (a as u64).wrapping_mul(b as u64);
27    (m as u32) ^ ((m >> 32) as u32)
28}
29
30/// Compute a fast, high-quality hash of identifier bytes.
31///
32/// Design:
33/// - **Read all bytes** for short strings (≤8), sample head+middle+tail for longer
34/// - **wyhash-style mixing** — 64-bit multiply + XOR-fold provides non-linear
35///   combination where every input bit affects many output bits
36/// - **Asymmetric seeds** — XOR with different constants before mixing, so even
37///   when read windows overlap (e.g. 4-byte strings where head == tail), the two
38///   mixer inputs are always different
39/// - **`const fn`** — enables compile-time hashing for `ident!("await")` macros
40#[inline]
41pub const fn ident_hash(s: &[u8]) -> u32 {
42    // Seeds from murmur3 / splitmix
43    const SEED1: u32 = 0x9E37_79B9;
44    const SEED2: u32 = 0x85EB_CA6B;
45
46    let len = s.len();
47    if len == 0 {
48        return 0;
49    }
50
51    if len < 4 {
52        // 1-3 bytes: wyhash trick — read first, middle, last byte.
53        // Captures all bytes regardless of length (middle overlaps for len=1,2).
54        let packed = ((s[0] as u32) << 16) | ((s[len >> 1] as u32) << 8) | (s[len - 1] as u32);
55        mix(packed ^ SEED1, packed ^ SEED2)
56    } else if len <= 8 {
57        // 4-8 bytes: read first 4 + last 4 (covers all bytes, may overlap).
58        // XOR with different seeds so even when head == tail (len=4), the mix inputs differ.
59        let head = read_u32(s, 0);
60        let tail = read_u32(s, len - 4);
61        mix(head ^ SEED1, tail ^ SEED2)
62    } else if len <= 16 {
63        // 9-16 bytes: head + middle + tail (12 bytes sampled).
64        let head = read_u32(s, 0);
65        let mid = read_u32(s, (len >> 1) - 2);
66        let tail = read_u32(s, len - 4);
67        mix(head ^ mid ^ SEED1, tail ^ SEED2)
68    } else {
69        // 17+ bytes: 4 evenly-spaced samples (16 bytes total).
70        let head = read_u32(s, 0);
71        let mid1 = read_u32(s, len / 3);
72        let mid2 = read_u32(s, 2 * len / 3);
73        let tail = read_u32(s, len - 4);
74        mix(head ^ mid1 ^ SEED1, mid2 ^ tail ^ SEED2)
75    }
76}
77
78/// Pack length and hash into a single `u64`.
79///
80/// Lower 32 bits = length, upper 32 bits = hash.
81///
82/// This is used as compact precomputed data inside `Ident` and as the
83/// `Ident::hash()` payload passed into [`IdentHasher::write_u64`].
84#[inline]
85pub const fn pack_len_hash(len: u32, hash: u32) -> u64 {
86    (len as u64) | ((hash as u64) << 32)
87}
88
89/// Compose the final hash state used by hashbrown.
90///
91/// Keep upper bits derived from `hash` (for tag/SIMD filtering), while ensuring
92/// the low bits used for bucket index also contain hash entropy instead of being
93/// dominated by identifier length.
94#[inline]
95const fn hashbrown_state(len: u32, hash: u32) -> u64 {
96    let low = hash ^ len.rotate_left(16);
97    (low as u64) | ((hash as u64) << 32)
98}
99
100/// Unpack `len` (low 32 bits) and `hash` (high 32 bits) from packed `u64`.
101#[expect(clippy::cast_possible_truncation)]
102#[inline]
103const fn unpack_len_hash(packed: u64) -> (u32, u32) {
104    (packed as u32, (packed >> 32) as u32)
105}
106
107/// A [`BuildHasher`] for `Ident`-keyed hash maps.
108///
109/// Creates [`IdentHasher`]s that accept both precomputed `u64` hashes (from `Ident`)
110/// and raw byte slices (from `str` lookups).
111#[derive(Debug, Clone, Copy, Default)]
112pub struct IdentBuildHasher;
113
114impl BuildHasher for IdentBuildHasher {
115    type Hasher = IdentHasher;
116
117    #[inline]
118    fn build_hasher(&self) -> Self::Hasher {
119        IdentHasher { state: 0 }
120    }
121}
122
123/// A [`Hasher`] that handles both `Ident` (precomputed) and `str` (computed) hash paths.
124#[derive(Debug, Clone, Copy)]
125pub struct IdentHasher {
126    state: u64,
127}
128
129impl Hasher for IdentHasher {
130    /// `Ident::hash()` path: decode packed `len|hash` and normalize.
131    #[inline]
132    fn write_u64(&mut self, i: u64) {
133        let (len, hash) = unpack_len_hash(i);
134        self.state = hashbrown_state(len, hash);
135    }
136
137    /// `str::hash()` path: compute `len` + hash from bytes, then normalize.
138    #[inline]
139    #[expect(clippy::cast_possible_truncation)] // Identifier strings are always < 4GB
140    fn write(&mut self, bytes: &[u8]) {
141        let hash = ident_hash(bytes);
142        self.state = hashbrown_state(bytes.len() as u32, hash);
143    }
144
145    /// `str::hash()` writes a 0xFF terminator byte after the string bytes. Ignore it.
146    #[inline]
147    fn write_u8(&mut self, _: u8) {}
148
149    #[inline]
150    fn finish(&self) -> u64 {
151        self.state
152    }
153}
154
155#[cfg(test)]
156mod test {
157    use std::hash::{BuildHasher, Hasher};
158
159    use super::{
160        IdentBuildHasher, IdentHasher, hashbrown_state, ident_hash, pack_len_hash, unpack_len_hash,
161    };
162
163    // ---- ident_hash quality tests ----
164
165    #[test]
166    fn hash_empty() {
167        assert_eq!(ident_hash(b""), 0);
168    }
169
170    #[test]
171    fn hash_nonzero_for_all_lengths() {
172        // Every non-empty string should produce a non-zero hash
173        for s in
174            &[b"x" as &[u8], b"ab", b"abc", b"abcd", b"hello", b"useState", b"longIdentifierName"]
175        {
176            assert_ne!(
177                ident_hash(s),
178                0,
179                "hash should be non-zero for {:?}",
180                std::str::from_utf8(s)
181            );
182        }
183    }
184
185    #[test]
186    fn hash_discriminates_similar_idents() {
187        // Same-length identifiers with small differences should hash differently
188        assert_ne!(ident_hash(b"null"), ident_hash(b"void"));
189        assert_ne!(ident_hash(b"this"), ident_hash(b"that"));
190        assert_ne!(ident_hash(b"abcd"), ident_hash(b"abce"));
191        assert_ne!(ident_hash(b"useState"), ident_hash(b"useEffect"));
192        assert_ne!(ident_hash(b"fooBar"), ident_hash(b"fooBaz"));
193        assert_ne!(ident_hash(b"a"), ident_hash(b"b"));
194        assert_ne!(ident_hash(b"ab"), ident_hash(b"ba")); // spellchecker:disable-line
195    }
196
197    #[test]
198    fn hash_four_byte_no_zero_collision() {
199        // The old hash had all 4-byte strings colliding to 0. Verify that's fixed.
200        let keywords = [b"null", b"void", b"this", b"that", b"true", b"else", b"case", b"from"];
201        for kw in &keywords {
202            assert_ne!(ident_hash(*kw), 0, "4-byte keyword {kw:?} should not hash to 0");
203        }
204        // Also verify they're all distinct
205        for (i, a) in keywords.iter().enumerate() {
206            for b in &keywords[i + 1..] {
207                assert_ne!(
208                    ident_hash(*a),
209                    ident_hash(*b),
210                    "{a:?} and {b:?} should have different hashes",
211                );
212            }
213        }
214    }
215
216    #[test]
217    fn hash_long_strings_with_middle_differences() {
218        // Strings that differ only in the middle — the 17+ path must catch this
219        assert_ne!(
220            ident_hash(b"privateInterfaceWithPrivatePropertyTypes"),
221            ident_hash(b"privateInterfaceWithPrivateParmeterTypes"), // spellchecker:disable-line
222        );
223    }
224
225    // ---- pack_len_hash tests ----
226
227    #[test]
228    fn pack_round_trip() {
229        let len = 42u32;
230        let hash = 0xDEAD_BEEFu32;
231        let packed = pack_len_hash(len, hash);
232        assert_eq!((packed & 0xFFFF_FFFF) as u32, len);
233        assert_eq!((packed >> 32) as u32, hash);
234    }
235
236    // ---- IdentHasher tests ----
237
238    #[test]
239    fn hasher_write_u64_path() {
240        let mut hasher = IdentHasher { state: 0 };
241        let hash = ident_hash(b"hello");
242        let packed = pack_len_hash(5, hash);
243        hasher.write_u64(packed);
244        assert_eq!(hasher.finish(), hashbrown_state(5, hash));
245    }
246
247    #[test]
248    fn hasher_write_bytes_path() {
249        let mut hasher = IdentHasher { state: 0 };
250        hasher.write(b"hello");
251        let hash = ident_hash(b"hello");
252        let expected = hashbrown_state(5, hash);
253        assert_eq!(hasher.finish(), expected);
254    }
255
256    #[test]
257    fn str_hash_matches_ident_packed() {
258        // Simulate what str::hash() does: write bytes, then write_u8(0xFF)
259        let build_hasher = IdentBuildHasher;
260        let mut hasher = build_hasher.build_hasher();
261        hasher.write(b"fooBar");
262        hasher.write_u8(0xFF);
263        let str_hash = hasher.finish();
264
265        // Simulate what Ident::hash() does: write_u64 with packed value
266        let mut hasher2 = build_hasher.build_hasher();
267        let packed = pack_len_hash(6, ident_hash(b"fooBar"));
268        hasher2.write_u64(packed);
269        let ident_hash_val = hasher2.finish();
270
271        assert_eq!(str_hash, ident_hash_val);
272    }
273
274    #[test]
275    fn build_hasher_default() {
276        let bh = IdentBuildHasher;
277        let hasher = bh.build_hasher();
278        assert_eq!(hasher.finish(), 0);
279    }
280
281    /// Test that `str` hashing through standard `Hash` trait produces the same
282    /// result as our manual `write` path.
283    #[test]
284    fn std_str_hash_compatible() {
285        let build_hasher = IdentBuildHasher;
286        let std_hash = build_hasher.hash_one("hello");
287
288        let mut hasher2 = build_hasher.build_hasher();
289        hasher2.write(b"hello");
290        hasher2.write_u8(0xFF);
291        let manual_hash = hasher2.finish();
292
293        assert_eq!(std_hash, manual_hash);
294    }
295
296    /// Verify str and Ident hash paths agree across all length buckets.
297    #[test]
298    fn str_hash_matches_across_lengths() {
299        let test_cases = [
300            "x",                                        // 1 byte
301            "ab",                                       // 2 bytes
302            "foo",                                      // 3 bytes
303            "this",                                     // 4 bytes
304            "hello",                                    // 5 bytes
305            "fooBar",                                   // 6 bytes
306            "useState",                                 // 8 bytes
307            "useEffect",                                // 9 bytes
308            "myVariable123",                            // 13 bytes
309            "longIdentifierNm",                         // 17 bytes
310            "privateInterfaceWithPrivatePropertyTypes", // 40 bytes
311        ];
312
313        let build_hasher = IdentBuildHasher;
314        for s in &test_cases {
315            // str path
316            let str_hash = build_hasher.hash_one(*s);
317
318            // Ident path (precomputed)
319            let mut hasher = build_hasher.build_hasher();
320            #[expect(clippy::cast_possible_truncation)]
321            let packed = pack_len_hash(s.len() as u32, ident_hash(s.as_bytes()));
322            hasher.write_u64(packed);
323            let ident_hash_val = hasher.finish();
324
325            assert_eq!(str_hash, ident_hash_val, "hash mismatch for {s:?} (len={})", s.len());
326        }
327    }
328
329    #[test]
330    fn hashbrown_state_uses_hash_entropy_for_h1() {
331        let len = 6u32;
332        let hash1 = ident_hash(b"fooBar");
333        let hash2 = ident_hash(b"fooBaz");
334
335        let state1 = hashbrown_state(len, hash1);
336        let state2 = hashbrown_state(len, hash2);
337
338        // Keep tag bits tied to identifier hash.
339        assert_eq!((state1 >> 32) as u32, hash1);
340        assert_eq!((state2 >> 32) as u32, hash2);
341        // Ensure same-length identifiers don't collapse to same low bits.
342        let (low1, _) = unpack_len_hash(state1);
343        let (low2, _) = unpack_len_hash(state2);
344        assert_ne!(low1, low2);
345    }
346}