yimi_rutool/algorithms/
hash_functions.rs

1//! Hash function implementations for bloom filters
2//!
3//! This module provides various hash function implementations optimized
4//! for use in bloom filters and other probabilistic data structures.
5
6use std::collections::hash_map::DefaultHasher;
7use std::hash::{Hash, Hasher as StdHasher};
8
9/// Trait for hash functions used in bloom filters
10pub trait Hasher {
11    /// Compute hash value for the given item
12    fn hash<T: Hash + ?Sized>(&self, item: &T) -> usize;
13}
14
15/// Different hash function algorithms available
16#[derive(Debug, Clone, Copy, PartialEq, Eq)]
17pub enum HashAlgorithm {
18    /// Default Rust hasher (SipHash)
19    Default,
20    /// Murmur3 hash (32-bit)
21    Murmur3,
22    /// FNV-1a hash
23    Fnv1a,
24    /// Custom seeded hash
25    Seeded(u64),
26}
27
28/// A hash function with configurable algorithm and seed
29#[derive(Debug, Clone)]
30pub struct HashFunction {
31    algorithm: HashAlgorithm,
32    seed: u64,
33}
34
35impl HashFunction {
36    /// Create a new hash function with the default algorithm
37    ///
38    /// # Examples
39    ///
40    /// ```
41    /// use yimi_rutool::algorithms::hash_functions::{HashFunction, Hasher};
42    ///
43    /// let hash_fn = HashFunction::new();
44    /// let hash_value = hash_fn.hash("hello");
45    /// assert!(hash_value > 0);
46    /// ```
47    pub fn new() -> Self {
48        HashFunction {
49            algorithm: HashAlgorithm::Default,
50            seed: 0,
51        }
52    }
53
54    /// Create a new hash function with a specific algorithm
55    ///
56    /// # Arguments
57    ///
58    /// * `algorithm` - The hash algorithm to use
59    ///
60    /// # Examples
61    ///
62    /// ```
63    /// use yimi_rutool::algorithms::hash_functions::{HashFunction, HashAlgorithm, Hasher};
64    ///
65    /// let hash_fn = HashFunction::with_algorithm(HashAlgorithm::Murmur3);
66    /// let hash_value = hash_fn.hash("hello");
67    /// assert!(hash_value > 0);
68    /// ```
69    pub fn with_algorithm(algorithm: HashAlgorithm) -> Self {
70        HashFunction { algorithm, seed: 0 }
71    }
72
73    /// Create a new hash function with a specific algorithm and seed
74    ///
75    /// # Arguments
76    ///
77    /// * `algorithm` - The hash algorithm to use
78    /// * `seed` - Seed value for the hash function
79    ///
80    /// # Examples
81    ///
82    /// ```
83    /// use yimi_rutool::algorithms::hash_functions::{HashFunction, HashAlgorithm, Hasher};
84    ///
85    /// let hash_fn = HashFunction::with_seed(HashAlgorithm::Fnv1a, 12345);
86    /// let hash_value = hash_fn.hash("hello");
87    /// assert!(hash_value > 0);
88    /// ```
89    pub fn with_seed(algorithm: HashAlgorithm, seed: u64) -> Self {
90        HashFunction { algorithm, seed }
91    }
92
93    /// Generate multiple hash functions with different seeds
94    ///
95    /// This is useful for bloom filters that need multiple independent hash functions.
96    ///
97    /// # Arguments
98    ///
99    /// * `count` - Number of hash functions to generate
100    ///
101    /// # Examples
102    ///
103    /// ```
104    /// use yimi_rutool::algorithms::hash_functions::HashFunction;
105    ///
106    /// let hash_functions = HashFunction::generate_functions(5);
107    /// assert_eq!(hash_functions.len(), 5);
108    /// ```
109    pub fn generate_functions(count: usize) -> Vec<HashFunction> {
110        let algorithms = [
111            HashAlgorithm::Default,
112            HashAlgorithm::Murmur3,
113            HashAlgorithm::Fnv1a,
114        ];
115
116        let mut functions = Vec::with_capacity(count);
117
118        for i in 0..count {
119            let algorithm = algorithms[i % algorithms.len()];
120            let seed = (i as u64).wrapping_mul(0x9e3779b97f4a7c15_u64); // Golden ratio multiplier
121            functions.push(HashFunction::with_seed(algorithm, seed));
122        }
123
124        functions
125    }
126
127    /// Get the algorithm used by this hash function
128    pub fn algorithm(&self) -> HashAlgorithm {
129        self.algorithm
130    }
131
132    /// Get the seed used by this hash function
133    pub fn seed(&self) -> u64 {
134        self.seed
135    }
136}
137
138impl Default for HashFunction {
139    fn default() -> Self {
140        Self::new()
141    }
142}
143
144impl Hasher for HashFunction {
145    fn hash<T: Hash + ?Sized>(&self, item: &T) -> usize {
146        match self.algorithm {
147            HashAlgorithm::Default => {
148                let mut hasher = DefaultHasher::new();
149                self.seed.hash(&mut hasher);
150                item.hash(&mut hasher);
151                hasher.finish() as usize
152            }
153            HashAlgorithm::Murmur3 => murmur3_hash(item, self.seed as u32),
154            HashAlgorithm::Fnv1a => fnv1a_hash(item, self.seed),
155            HashAlgorithm::Seeded(base_seed) => {
156                let mut hasher = DefaultHasher::new();
157                (base_seed ^ self.seed).hash(&mut hasher);
158                item.hash(&mut hasher);
159                hasher.finish() as usize
160            }
161        }
162    }
163}
164
165/// Murmur3 32-bit hash implementation
166///
167/// This is a simplified version of MurmurHash3_x86_32 optimized for speed.
168fn murmur3_hash<T: Hash + ?Sized>(item: &T, seed: u32) -> usize {
169    // Convert the item to bytes using a hasher
170    let mut hasher = DefaultHasher::new();
171    seed.hash(&mut hasher);
172    item.hash(&mut hasher);
173    let hash64 = hasher.finish();
174
175    // Apply Murmur3 finalizer to improve distribution
176    let mut h = hash64 as u32;
177    h ^= h >> 16;
178    h = h.wrapping_mul(0x85ebca6b);
179    h ^= h >> 13;
180    h = h.wrapping_mul(0xc2b2ae35);
181    h ^= h >> 16;
182
183    h as usize
184}
185
186/// FNV-1a hash implementation
187///
188/// Fast hash function with good distribution properties.
189fn fnv1a_hash<T: Hash + ?Sized>(item: &T, seed: u64) -> usize {
190    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
191    const FNV_PRIME: u64 = 0x100000001b3;
192
193    // Start with offset basis XORed with seed
194    let mut hash = FNV_OFFSET_BASIS ^ seed;
195
196    // Hash the item to get bytes
197    let mut hasher = DefaultHasher::new();
198    item.hash(&mut hasher);
199    let item_hash = hasher.finish();
200
201    // Apply FNV-1a algorithm to the item hash bytes
202    let bytes = item_hash.to_le_bytes();
203    for &byte in &bytes {
204        hash ^= byte as u64;
205        hash = hash.wrapping_mul(FNV_PRIME);
206    }
207
208    hash as usize
209}
210
211/// Double hashing strategy for generating multiple hash values
212///
213/// Uses the formula: h_i(x) = (h1(x) + i * h2(x)) mod m
214/// where h1 and h2 are two independent hash functions.
215pub struct DoubleHasher {
216    hasher1: HashFunction,
217    hasher2: HashFunction,
218}
219
220impl DoubleHasher {
221    /// Create a new double hasher
222    ///
223    /// # Examples
224    ///
225    /// ```
226    /// use yimi_rutool::algorithms::hash_functions::DoubleHasher;
227    ///
228    /// let double_hasher = DoubleHasher::new();
229    /// let hashes = double_hasher.hash_multiple("hello", 5, 1000);
230    /// assert_eq!(hashes.len(), 5);
231    /// ```
232    pub fn new() -> Self {
233        DoubleHasher {
234            hasher1: HashFunction::with_algorithm(HashAlgorithm::Default),
235            hasher2: HashFunction::with_algorithm(HashAlgorithm::Murmur3),
236        }
237    }
238
239    /// Create a double hasher with specific algorithms
240    ///
241    /// # Arguments
242    ///
243    /// * `algo1` - First hash algorithm
244    /// * `algo2` - Second hash algorithm
245    pub fn with_algorithms(algo1: HashAlgorithm, algo2: HashAlgorithm) -> Self {
246        DoubleHasher {
247            hasher1: HashFunction::with_algorithm(algo1),
248            hasher2: HashFunction::with_algorithm(algo2),
249        }
250    }
251
252    /// Generate multiple hash values using double hashing
253    ///
254    /// # Arguments
255    ///
256    /// * `item` - Item to hash
257    /// * `count` - Number of hash values to generate
258    /// * `max_value` - Maximum value for hash results (modulo operation)
259    ///
260    /// # Examples
261    ///
262    /// ```
263    /// use yimi_rutool::algorithms::hash_functions::DoubleHasher;
264    ///
265    /// let double_hasher = DoubleHasher::new();
266    /// let hashes = double_hasher.hash_multiple("test", 3, 100);
267    /// assert_eq!(hashes.len(), 3);
268    /// for &hash in &hashes {
269    ///     assert!(hash < 100);
270    /// }
271    /// ```
272    pub fn hash_multiple<T: Hash + ?Sized>(
273        &self,
274        item: &T,
275        count: usize,
276        max_value: usize,
277    ) -> Vec<usize> {
278        if max_value == 0 {
279            return vec![0; count];
280        }
281
282        let h1 = self.hasher1.hash(item) % max_value;
283        let h2 = self.hasher2.hash(item) % max_value;
284
285        // Ensure h2 is odd to avoid cycles
286        let h2 = if h2 % 2 == 0 {
287            (h2 + 1) % max_value
288        } else {
289            h2
290        };
291
292        let mut hashes = Vec::with_capacity(count);
293        for i in 0..count {
294            let hash = (h1 + i * h2) % max_value;
295            hashes.push(hash);
296        }
297
298        hashes
299    }
300}
301
302impl Default for DoubleHasher {
303    fn default() -> Self {
304        Self::new()
305    }
306}
307
308/// Hash function quality metrics
309pub struct HashQuality {
310    /// Distribution uniformity (0.0 to 1.0, where 1.0 is perfectly uniform)
311    pub uniformity: f64,
312    /// Collision rate for a sample set
313    pub collision_rate: f64,
314    /// Avalanche effect score (0.0 to 1.0, where 1.0 is ideal)
315    pub avalanche_score: f64,
316}
317
318/// Evaluate hash function quality with a test dataset
319///
320/// # Arguments
321///
322/// * `hasher` - Hash function to evaluate
323/// * `test_data` - Test dataset
324/// * `bucket_count` - Number of buckets for distribution testing
325///
326/// # Examples
327///
328/// ```
329/// use yimi_rutool::algorithms::hash_functions::{HashFunction, evaluate_hash_quality};
330///
331/// let hash_fn = HashFunction::new();
332/// let test_data: Vec<String> = (0..1000).map(|i| format!("item_{}", i)).collect();
333/// let quality = evaluate_hash_quality(&hash_fn, &test_data, 100);
334/// println!("Uniformity: {:.3}", quality.uniformity);
335/// ```
336pub fn evaluate_hash_quality<T: Hash + ?Sized + Clone>(
337    hasher: &HashFunction,
338    test_data: &[T],
339    bucket_count: usize,
340) -> HashQuality {
341    if test_data.is_empty() || bucket_count == 0 {
342        return HashQuality {
343            uniformity: 0.0,
344            collision_rate: 0.0,
345            avalanche_score: 0.0,
346        };
347    }
348
349    // Test distribution uniformity
350    let mut buckets = vec![0; bucket_count];
351    let mut hash_values = Vec::new();
352
353    for item in test_data {
354        let hash = hasher.hash(item);
355        hash_values.push(hash);
356        buckets[hash % bucket_count] += 1;
357    }
358
359    // Calculate uniformity using chi-square test
360    let expected = test_data.len() as f64 / bucket_count as f64;
361    let chi_square: f64 = buckets
362        .iter()
363        .map(|&observed| {
364            let diff = observed as f64 - expected;
365            diff * diff / expected
366        })
367        .sum();
368
369    // Normalize chi-square to get uniformity score (inverse relationship)
370    let uniformity = 1.0 / (1.0 + chi_square / bucket_count as f64);
371
372    // Calculate collision rate
373    let unique_hashes = {
374        let mut sorted_hashes = hash_values.clone();
375        sorted_hashes.sort_unstable();
376        sorted_hashes.dedup();
377        sorted_hashes.len()
378    };
379    let collision_rate = 1.0 - (unique_hashes as f64 / test_data.len() as f64);
380
381    // Simple avalanche test (this is a simplified version)
382    let avalanche_score = calculate_avalanche_score(hasher, test_data);
383
384    HashQuality {
385        uniformity,
386        collision_rate,
387        avalanche_score,
388    }
389}
390
391/// Calculate avalanche effect score
392fn calculate_avalanche_score<T: Hash + ?Sized + Clone>(
393    hasher: &HashFunction,
394    test_data: &[T],
395) -> f64 {
396    if test_data.is_empty() {
397        return 0.0;
398    }
399
400    let sample_size = test_data.len().min(100); // Limit sample size for performance
401    let mut bit_flip_counts = 0;
402    let mut total_comparisons = 0;
403
404    for i in 0..sample_size {
405        let hash1 = hasher.hash(&test_data[i]);
406
407        // For avalanche test, we'd need to flip bits in the input,
408        // but since we can't easily do that with generic Hash types,
409        // we'll use a simplified approach by comparing consecutive items
410        if i + 1 < sample_size {
411            let hash2 = hasher.hash(&test_data[i + 1]);
412            let xor_result = hash1 ^ hash2;
413            bit_flip_counts += xor_result.count_ones();
414            total_comparisons += std::mem::size_of::<usize>() * 8; // bits in usize
415        }
416    }
417
418    if total_comparisons == 0 {
419        return 0.0;
420    }
421
422    // Ideally, about 50% of bits should flip
423    let flip_ratio = bit_flip_counts as f64 / total_comparisons as f64;
424    1.0 - (flip_ratio - 0.5).abs() * 2.0 // Score closer to 1.0 when flip_ratio is closer to 0.5
425}
426
427#[cfg(test)]
428mod tests {
429    use super::*;
430
431    #[test]
432    fn test_hash_function_creation() {
433        let hash_fn = HashFunction::new();
434        assert_eq!(hash_fn.algorithm(), HashAlgorithm::Default);
435        assert_eq!(hash_fn.seed(), 0);
436    }
437
438    #[test]
439    fn test_hash_function_with_algorithm() {
440        let hash_fn = HashFunction::with_algorithm(HashAlgorithm::Murmur3);
441        assert_eq!(hash_fn.algorithm(), HashAlgorithm::Murmur3);
442        assert_eq!(hash_fn.seed(), 0);
443    }
444
445    #[test]
446    fn test_hash_function_with_seed() {
447        let hash_fn = HashFunction::with_seed(HashAlgorithm::Fnv1a, 12345);
448        assert_eq!(hash_fn.algorithm(), HashAlgorithm::Fnv1a);
449        assert_eq!(hash_fn.seed(), 12345);
450    }
451
452    #[test]
453    fn test_generate_functions() {
454        let functions = HashFunction::generate_functions(5);
455        assert_eq!(functions.len(), 5);
456
457        // Each function should have a different seed
458        for i in 0..functions.len() {
459            for j in (i + 1)..functions.len() {
460                assert_ne!(functions[i].seed(), functions[j].seed());
461            }
462        }
463    }
464
465    #[test]
466    fn test_hash_consistency() {
467        let hash_fn = HashFunction::new();
468        let item = "test_string";
469
470        let hash1 = hash_fn.hash(item);
471        let hash2 = hash_fn.hash(item);
472
473        assert_eq!(hash1, hash2); // Same input should produce same hash
474    }
475
476    #[test]
477    fn test_different_algorithms_produce_different_hashes() {
478        let item = "test_string";
479
480        let default_hash = HashFunction::with_algorithm(HashAlgorithm::Default).hash(item);
481        let murmur3_hash = HashFunction::with_algorithm(HashAlgorithm::Murmur3).hash(item);
482        let fnv1a_hash = HashFunction::with_algorithm(HashAlgorithm::Fnv1a).hash(item);
483
484        // Different algorithms should generally produce different hashes
485        assert_ne!(default_hash, murmur3_hash);
486        assert_ne!(default_hash, fnv1a_hash);
487        assert_ne!(murmur3_hash, fnv1a_hash);
488    }
489
490    #[test]
491    fn test_different_seeds_produce_different_hashes() {
492        let item = "test_string";
493        let algorithm = HashAlgorithm::Default;
494
495        let hash1 = HashFunction::with_seed(algorithm, 123).hash(item);
496        let hash2 = HashFunction::with_seed(algorithm, 456).hash(item);
497
498        assert_ne!(hash1, hash2); // Different seeds should produce different hashes
499    }
500
501    #[test]
502    fn test_hash_distribution() {
503        let hash_fn = HashFunction::new();
504        let bucket_count = 100;
505        let item_count = 1000;
506
507        let mut buckets = vec![0; bucket_count];
508
509        for i in 0..item_count {
510            let item = format!("item_{}", i);
511            let hash = hash_fn.hash(&item);
512            buckets[hash % bucket_count] += 1;
513        }
514
515        // Check that no bucket is empty and none is overly full
516        let min_count = buckets.iter().min().unwrap();
517        let max_count = buckets.iter().max().unwrap();
518
519        assert!(*min_count > 0); // No bucket should be empty
520        assert!(*max_count < item_count / 5); // No bucket should have more than 20% of items
521    }
522
523    #[test]
524    fn test_double_hasher() {
525        let double_hasher = DoubleHasher::new();
526        let hashes = double_hasher.hash_multiple("test", 5, 100);
527
528        assert_eq!(hashes.len(), 5);
529
530        // All hashes should be within bounds
531        for &hash in &hashes {
532            assert!(hash < 100);
533        }
534
535        // Hashes should be different (with high probability)
536        let unique_hashes: std::collections::HashSet<_> = hashes.iter().collect();
537        assert!(unique_hashes.len() > 1); // Should have at least some different values
538    }
539
540    #[test]
541    fn test_double_hasher_zero_max_value() {
542        let double_hasher = DoubleHasher::new();
543        let hashes = double_hasher.hash_multiple("test", 5, 0);
544
545        assert_eq!(hashes.len(), 5);
546        assert!(hashes.iter().all(|&h| h == 0));
547    }
548
549    #[test]
550    fn test_hash_quality_evaluation() {
551        let hash_fn = HashFunction::new();
552        let test_data: Vec<String> = (0..1000).map(|i| format!("item_{}", i)).collect();
553
554        let quality = evaluate_hash_quality(&hash_fn, &test_data, 100);
555
556        assert!(quality.uniformity >= 0.0 && quality.uniformity <= 1.0);
557        assert!(quality.collision_rate >= 0.0 && quality.collision_rate <= 1.0);
558        assert!(quality.avalanche_score >= 0.0 && quality.avalanche_score <= 1.0);
559    }
560
561    #[test]
562    fn test_hash_quality_empty_data() {
563        let hash_fn = HashFunction::new();
564        let test_data: Vec<String> = vec![];
565
566        let quality = evaluate_hash_quality(&hash_fn, &test_data, 100);
567
568        assert_eq!(quality.uniformity, 0.0);
569        assert_eq!(quality.collision_rate, 0.0);
570        assert_eq!(quality.avalanche_score, 0.0);
571    }
572
573    #[test]
574    fn test_murmur3_hash() {
575        let item = "test";
576        let hash1 = murmur3_hash(&item, 123);
577        let hash2 = murmur3_hash(&item, 123);
578        let hash3 = murmur3_hash(&item, 456);
579
580        assert_eq!(hash1, hash2); // Same input and seed should produce same hash
581        assert_ne!(hash1, hash3); // Different seed should produce different hash
582    }
583
584    #[test]
585    fn test_fnv1a_hash() {
586        let item = "test";
587        let hash1 = fnv1a_hash(&item, 123);
588        let hash2 = fnv1a_hash(&item, 123);
589        let hash3 = fnv1a_hash(&item, 456);
590
591        assert_eq!(hash1, hash2); // Same input and seed should produce same hash
592        assert_ne!(hash1, hash3); // Different seed should produce different hash
593    }
594
595    #[test]
596    fn test_seeded_algorithm() {
597        let item = "test";
598        let hash_fn1 = HashFunction::with_seed(HashAlgorithm::Seeded(999), 123);
599        let hash_fn2 = HashFunction::with_seed(HashAlgorithm::Seeded(999), 456);
600
601        let hash1 = hash_fn1.hash(&item);
602        let hash2 = hash_fn2.hash(&item);
603
604        assert_ne!(hash1, hash2); // Different seeds should produce different hashes
605    }
606}