Skip to main content

thread_utilities/
hash_help.rs

1// SPDX-FileCopyrightText: 2025 Knitli Inc. <knitli@knit.li>
2// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
3// SPDX-License-Identifier: AGPL-3.0-or-later
4//! Hash map, set, and related hashing utilities.
5//!
6//! Thread uses [`rapidhash::RapidHashMap`] and [`rapidhash::RapidHashSet`] as stand-ins for
7//! `std::collections::HashMap` and `std::collections::HashSet` (they ARE `std::collections::HashMap` and
8//! `std::collections::HashSet`, but using the [`rapidhash::fast::RandomState`] hash builder.)
9//!
10//! For Thread's expected workloads, it's *very fast* and sufficiently secure for our needs.
11//! // Important to note that `rapidhash` is not a cryptographic hash, and while it's a high quality hash that's optimal in most ways, it hasn't been thoroughly tested for `HashDoD` resistance.
12//! For how we use it, this isn't a concern. We also use random seeds for the hash builder, so it should be resistant to hash collision attacks.
13
14use rapidhash::fast::RandomState;
15
16// export RapidHasher for use as a type
17pub use rapidhash::fast::RapidHasher as RapidInlineHasher;
18
19// These are effectively aliases for `rapidhash::RapidHashMap` and `rapidhash::RapidHashSet`
20// They're less of a mouthful, and we avoid type aliasing a type alias
21/// A type alias for `[rapidhash::RapidHashMap]`.
22pub type RapidMap<K, V> = rapidhash::RapidHashMap<K, V>;
23/// A type alias for `[rapidhash::RapidHashSet]`.
24pub type RapidSet<T> = rapidhash::RapidHashSet<T>;
25
26/// Creates a new `RapidMap` with the specified capacity; returning the initialized map for use.
27#[inline(always)]
28#[must_use]
29pub fn map_with_capacity<K, V>(capacity: usize) -> RapidMap<K, V>
30where
31    K: std::hash::Hash + Eq,
32    V: Default,
33{
34    RapidMap::with_capacity_and_hasher(capacity, RandomState::default())
35}
36
37/// Creates a new `RapidInlineHashSet` with the specified capacity; returning the initialized set for use.
38#[inline(always)]
39#[must_use]
40pub fn set_with_capacity<T>(capacity: usize) -> RapidSet<T>
41where
42    T: std::hash::Hash + Eq,
43{
44    RapidSet::with_capacity_and_hasher(capacity, RandomState::default())
45}
46
47/// Returns a new `RapidMap` with default values.
48#[inline(always)]
49#[must_use]
50pub fn get_map<K, V>() -> RapidMap<K, V> {
51    RapidMap::default()
52}
53
54/// Returns a new `RapidSet` with default values (a [`rapidhash::RapidHashSet`]).
55#[inline(always)]
56#[must_use]
57pub fn get_set<T>() -> RapidSet<T> {
58    RapidSet::default()
59}
60
61/// Computes a hash for a [`std::fs::File`] object using `rapidhash`.
62#[inline(always)]
63pub fn hash_file(file: &mut std::fs::File) -> Result<u64, std::io::Error> {
64    rapidhash::v3::rapidhash_v3_file(file).map_err(std::io::Error::other)
65}
66
67/// Computes a hash for a [`std::fs::File`] object using `rapidhash` with a specified seed.
68pub fn hash_file_with_seed(file: &mut std::fs::File, seed: u64) -> Result<u64, std::io::Error> {
69    let secrets = rapidhash::v3::RapidSecrets::seed(seed);
70    rapidhash::v3::rapidhash_v3_file_seeded(file, &secrets).map_err(std::io::Error::other)
71}
72
73/// Computes a hash for a byte slice using `rapidhash`.
74#[inline(always)]
75#[must_use]
76pub const fn hash_bytes(bytes: &[u8]) -> u64 {
77    rapidhash::v3::rapidhash_v3(bytes)
78}
79
80/// Computes a hash for a byte slice using `rapidhash` with a specified seed.
81#[inline(always)]
82#[must_use]
83pub const fn hash_bytes_with_seed(bytes: &[u8], seed: u64) -> u64 {
84    // Note: RapidSecrets::seed is const, so this should be fine in a const fn
85    let secrets = rapidhash::v3::RapidSecrets::seed(seed);
86    rapidhash::v3::rapidhash_v3_seeded(bytes, &secrets)
87}
88
89#[cfg(test)]
90mod tests {
91    use super::*;
92    use std::collections::HashSet;
93    use std::io::Write;
94
95    // Test constants
96    const HASH_DISTRIBUTION_TEST_SIZE: usize = 1000;
97    const HASH_DISTRIBUTION_MIN_UNIQUENESS: usize = 95; // 95% uniqueness threshold
98    const LARGE_FILE_SIZE: usize = 100_000;
99
100    // Tests for hash_bytes
101    #[test]
102    fn test_hash_bytes_empty() {
103        let hash = hash_bytes(&[]);
104        // Should return a consistent hash for empty input
105        assert_eq!(hash, hash_bytes(&[]));
106    }
107
108    #[test]
109    fn test_hash_bytes_simple() {
110        let data = b"hello world";
111        let hash = hash_bytes(data);
112        // Should be deterministic
113        assert_eq!(hash, hash_bytes(data));
114    }
115
116    #[test]
117    fn test_hash_bytes_different_inputs() {
118        let hash1 = hash_bytes(b"hello");
119        let hash2 = hash_bytes(b"world");
120        let hash3 = hash_bytes(b"hello world");
121
122        // Different inputs should produce different hashes
123        assert_ne!(hash1, hash2);
124        assert_ne!(hash1, hash3);
125        assert_ne!(hash2, hash3);
126    }
127
128    #[test]
129    fn test_hash_bytes_deterministic() {
130        let data = b"The quick brown fox jumps over the lazy dog";
131        let hash1 = hash_bytes(data);
132        let hash2 = hash_bytes(data);
133
134        assert_eq!(hash1, hash2, "Hash should be deterministic");
135    }
136
137    #[test]
138    fn test_hash_bytes_avalanche() {
139        // Test that small changes in input produce different hashes (avalanche effect)
140        let hash1 = hash_bytes(b"test");
141        let hash2 = hash_bytes(b"Test"); // Single bit change
142        let hash3 = hash_bytes(b"test1"); // Additional character
143
144        assert_ne!(hash1, hash2);
145        assert_ne!(hash1, hash3);
146        assert_ne!(hash2, hash3);
147    }
148
149    #[test]
150    fn test_hash_bytes_large_input() {
151        // Test with larger input
152        let large_data = vec![0u8; 10000];
153        let hash1 = hash_bytes(&large_data);
154
155        // Should be deterministic even for large inputs
156        assert_eq!(hash1, hash_bytes(&large_data));
157
158        // Slightly different large input
159        let mut large_data2 = large_data.clone();
160        large_data2[5000] = 1;
161        let hash2 = hash_bytes(&large_data2);
162
163        assert_ne!(hash1, hash2);
164    }
165
166    #[test]
167    fn test_hash_bytes_various_sizes() {
168        // Test various input sizes to exercise different code paths
169        for size in [
170            0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256, 1023, 1024,
171        ] {
172            let data = vec![0u8; size];
173            let hash = hash_bytes(&data);
174            // Should be deterministic
175            assert_eq!(hash, hash_bytes(&data), "Failed for size {size}");
176        }
177    }
178
179    // Tests for hash_bytes_with_seed
180    #[test]
181    fn test_hash_bytes_with_seed_deterministic() {
182        let data = b"test data";
183        let seed = 12345u64;
184
185        let hash1 = hash_bytes_with_seed(data, seed);
186        let hash2 = hash_bytes_with_seed(data, seed);
187
188        assert_eq!(hash1, hash2, "Hash with seed should be deterministic");
189    }
190
191    #[test]
192    fn test_hash_bytes_with_seed_different_seeds() {
193        let data = b"test data";
194
195        let hash1 = hash_bytes_with_seed(data, 1);
196        let hash2 = hash_bytes_with_seed(data, 2);
197        let hash3 = hash_bytes_with_seed(data, 3);
198
199        // Different seeds should produce different hashes
200        assert_ne!(hash1, hash2);
201        assert_ne!(hash1, hash3);
202        assert_ne!(hash2, hash3);
203    }
204
205    #[test]
206    fn test_hash_bytes_with_seed_empty() {
207        let seed = 42u64;
208        let hash1 = hash_bytes_with_seed(&[], seed);
209        let hash2 = hash_bytes_with_seed(&[], seed);
210
211        assert_eq!(hash1, hash2);
212    }
213
214    #[test]
215    fn test_hash_bytes_with_seed_distribution() {
216        // Test that different seeds produce well-distributed hashes
217        let data = b"test";
218        let mut hashes = HashSet::new();
219
220        for seed in 0..100 {
221            let hash = hash_bytes_with_seed(data, seed);
222            hashes.insert(hash);
223        }
224
225        // Should have high uniqueness (allowing for small collision chance)
226        assert!(
227            hashes.len() >= HASH_DISTRIBUTION_MIN_UNIQUENESS,
228            "Expected high hash distribution, got {} unique hashes out of 100",
229            hashes.len()
230        );
231    }
232
233    // Tests for hash_file
234    #[test]
235    fn test_hash_file_empty() -> Result<(), std::io::Error> {
236        let mut temp_file = tempfile::NamedTempFile::new()?;
237        temp_file.flush()?;
238
239        let mut file = temp_file.reopen()?;
240        let hash1 = hash_file(&mut file)?;
241
242        // Reopen and hash again
243        let mut file = temp_file.reopen()?;
244        let hash2 = hash_file(&mut file)?;
245
246        assert_eq!(hash1, hash2, "Empty file hash should be deterministic");
247        Ok(())
248    }
249
250    #[test]
251    fn test_hash_file_simple() -> Result<(), std::io::Error> {
252        let mut temp_file = tempfile::NamedTempFile::new()?;
253        temp_file.write_all(b"hello world")?;
254        temp_file.flush()?;
255
256        let mut file = temp_file.reopen()?;
257        let hash1 = hash_file(&mut file)?;
258
259        // Reopen and hash again
260        let mut file = temp_file.reopen()?;
261        let hash2 = hash_file(&mut file)?;
262
263        assert_eq!(hash1, hash2, "File hash should be deterministic");
264        Ok(())
265    }
266
267    #[test]
268    fn test_hash_file_different_contents() -> Result<(), std::io::Error> {
269        let mut temp_file1 = tempfile::NamedTempFile::new()?;
270        temp_file1.write_all(b"hello")?;
271        temp_file1.flush()?;
272
273        let mut temp_file2 = tempfile::NamedTempFile::new()?;
274        temp_file2.write_all(b"world")?;
275        temp_file2.flush()?;
276
277        let mut file1 = temp_file1.reopen()?;
278        let hash1 = hash_file(&mut file1)?;
279
280        let mut file2 = temp_file2.reopen()?;
281        let hash2 = hash_file(&mut file2)?;
282
283        assert_ne!(
284            hash1, hash2,
285            "Different file contents should produce different hashes"
286        );
287        Ok(())
288    }
289
290    #[test]
291    fn test_hash_file_large() -> Result<(), std::io::Error> {
292        let mut temp_file = tempfile::NamedTempFile::new()?;
293        let large_data = vec![0xABu8; LARGE_FILE_SIZE];
294        temp_file.write_all(&large_data)?;
295        temp_file.flush()?;
296
297        let mut file = temp_file.reopen()?;
298        let hash1 = hash_file(&mut file)?;
299
300        // Reopen and hash again
301        let mut file = temp_file.reopen()?;
302        let hash2 = hash_file(&mut file)?;
303
304        assert_eq!(hash1, hash2, "Large file hash should be deterministic");
305        Ok(())
306    }
307
308    #[test]
309    fn test_hash_file_vs_hash_bytes_consistency() -> Result<(), std::io::Error> {
310        let data = b"test data for consistency check";
311
312        let mut temp_file = tempfile::NamedTempFile::new()?;
313        temp_file.write_all(data)?;
314        temp_file.flush()?;
315
316        let mut file = temp_file.reopen()?;
317        let file_hash = hash_file(&mut file)?;
318
319        let bytes_hash = hash_bytes(data);
320
321        assert_eq!(
322            file_hash, bytes_hash,
323            "File hash should match byte hash for same content"
324        );
325        Ok(())
326    }
327
328    // Tests for hash_file_with_seed
329    #[test]
330    fn test_hash_file_with_seed_deterministic() -> Result<(), std::io::Error> {
331        let mut temp_file = tempfile::NamedTempFile::new()?;
332        temp_file.write_all(b"test data")?;
333        temp_file.flush()?;
334
335        let seed = 12345u64;
336
337        let mut file1 = temp_file.reopen()?;
338        let hash1 = hash_file_with_seed(&mut file1, seed)?;
339
340        let mut file2 = temp_file.reopen()?;
341        let hash2 = hash_file_with_seed(&mut file2, seed)?;
342
343        assert_eq!(hash1, hash2, "File hash with seed should be deterministic");
344        Ok(())
345    }
346
347    #[test]
348    fn test_hash_file_with_seed_different_seeds() -> Result<(), std::io::Error> {
349        let mut temp_file = tempfile::NamedTempFile::new()?;
350        temp_file.write_all(b"test data")?;
351        temp_file.flush()?;
352
353        let mut file1 = temp_file.reopen()?;
354        let hash1 = hash_file_with_seed(&mut file1, 1)?;
355
356        let mut file2 = temp_file.reopen()?;
357        let hash2 = hash_file_with_seed(&mut file2, 2)?;
358
359        let mut file3 = temp_file.reopen()?;
360        let hash3 = hash_file_with_seed(&mut file3, 3)?;
361
362        assert_ne!(hash1, hash2);
363        assert_ne!(hash1, hash3);
364        assert_ne!(hash2, hash3);
365        Ok(())
366    }
367
368    #[test]
369    fn test_hash_file_with_seed_vs_hash_bytes_consistency() -> Result<(), std::io::Error> {
370        let data = b"test data for seeded consistency";
371        let seed = 42u64;
372
373        let mut temp_file = tempfile::NamedTempFile::new()?;
374        temp_file.write_all(data)?;
375        temp_file.flush()?;
376
377        let mut file = temp_file.reopen()?;
378        let file_hash = hash_file_with_seed(&mut file, seed)?;
379
380        let bytes_hash = hash_bytes_with_seed(data, seed);
381
382        assert_eq!(
383            file_hash, bytes_hash,
384            "Seeded file hash should match seeded byte hash"
385        );
386        Ok(())
387    }
388
389    // Tests for RapidMap and RapidSet helper functions
390    #[test]
391    fn test_get_map() {
392        let map: RapidMap<String, i32> = get_map();
393        assert!(map.is_empty());
394    }
395
396    #[test]
397    fn test_get_set() {
398        let set: RapidSet<String> = get_set();
399        assert!(set.is_empty());
400    }
401
402    #[test]
403    fn test_map_with_capacity() {
404        let map: RapidMap<String, i32> = map_with_capacity(100);
405        assert!(map.is_empty());
406        assert!(map.capacity() >= 100);
407    }
408
409    #[test]
410    fn test_set_with_capacity() {
411        let set: RapidSet<String> = set_with_capacity(100);
412        assert!(set.is_empty());
413        assert!(set.capacity() >= 100);
414    }
415
416    #[test]
417    fn test_rapid_map_basic_operations() {
418        let mut map: RapidMap<String, i32> = get_map();
419
420        map.insert("one".to_string(), 1);
421        map.insert("two".to_string(), 2);
422        map.insert("three".to_string(), 3);
423
424        assert_eq!(map.len(), 3);
425        assert_eq!(map.get("one"), Some(&1));
426        assert_eq!(map.get("two"), Some(&2));
427        assert_eq!(map.get("three"), Some(&3));
428        assert_eq!(map.get("four"), None);
429    }
430
431    #[test]
432    fn test_rapid_set_basic_operations() {
433        let mut set: RapidSet<String> = get_set();
434
435        set.insert("apple".to_string());
436        set.insert("banana".to_string());
437        set.insert("cherry".to_string());
438
439        assert_eq!(set.len(), 3);
440        assert!(set.contains("apple"));
441        assert!(set.contains("banana"));
442        assert!(set.contains("cherry"));
443        assert!(!set.contains("date"));
444    }
445
446    #[test]
447    fn test_rapid_map_with_capacity_usage() {
448        let mut map: RapidMap<i32, String> = map_with_capacity(10);
449
450        for i in 0..5 {
451            map.insert(i, format!("value_{i}"));
452        }
453
454        assert_eq!(map.len(), 5);
455        assert!(map.capacity() >= 10);
456    }
457
458    #[test]
459    fn test_rapid_set_with_capacity_usage() {
460        let mut set: RapidSet<i32> = set_with_capacity(10);
461
462        for i in 0..5 {
463            set.insert(i);
464        }
465
466        assert_eq!(set.len(), 5);
467        assert!(set.capacity() >= 10);
468    }
469
470    #[test]
471    fn test_rapid_map_hash_distribution() {
472        // Test that RapidMap handles hash collisions properly
473        let mut map: RapidMap<i32, String> = get_map();
474
475        for i in 0..HASH_DISTRIBUTION_TEST_SIZE {
476            map.insert(i as i32, format!("value_{i}"));
477        }
478
479        assert_eq!(map.len(), HASH_DISTRIBUTION_TEST_SIZE);
480
481        // Verify all values are retrievable
482        for i in 0..HASH_DISTRIBUTION_TEST_SIZE {
483            assert_eq!(map.get(&(i as i32)), Some(&format!("value_{i}")));
484        }
485    }
486
487    #[test]
488    fn test_rapid_set_hash_distribution() {
489        // Test that RapidSet handles hash collisions properly
490        let mut set: RapidSet<i32> = get_set();
491
492        for i in 0..HASH_DISTRIBUTION_TEST_SIZE {
493            set.insert(i as i32);
494        }
495
496        assert_eq!(set.len(), HASH_DISTRIBUTION_TEST_SIZE);
497
498        // Verify all values are present
499        for i in 0..HASH_DISTRIBUTION_TEST_SIZE {
500            assert!(set.contains(&(i as i32)));
501        }
502    }
503}