spider_util/
bloom_filter.rs1use seahash::hash;
2use std::collections::hash_map::DefaultHasher;
3use std::hash::{Hash, Hasher};
4
5pub struct BloomFilter {
9 bit_set: Vec<u64>,
10 num_bits: u64,
11 hash_functions: usize,
12}
13
14impl BloomFilter {
15 pub fn new(num_bits: u64, hash_functions: usize) -> Self {
17 let size = ((num_bits as f64 / 64.0).ceil() as usize).max(1);
18 Self {
19 bit_set: vec![0; size],
20 num_bits,
21 hash_functions,
22 }
23 }
24
25 pub fn add(&mut self, item: &str) {
27 for i in 0..self.hash_functions {
28 let index = self.get_bit_index(item, i);
29 let bucket_idx = (index / 64) as usize;
30 let bit_idx = (index % 64) as usize;
31
32 if bucket_idx < self.bit_set.len() {
33 self.bit_set[bucket_idx] |= 1u64 << bit_idx;
34 }
35 }
36 }
37
38 pub fn might_contain(&self, item: &str) -> bool {
41 for i in 0..self.hash_functions {
42 let index = self.get_bit_index(item, i);
43 let bucket_idx = (index / 64) as usize;
44 let bit_idx = (index % 64) as usize;
45
46 if bucket_idx >= self.bit_set.len() {
47 return false;
48 }
49
50 if (self.bit_set[bucket_idx] & (1u64 << bit_idx)) == 0 {
51 return false;
52 }
53 }
54 true
55 }
56
57 fn get_bit_index(&self, item: &str, i: usize) -> u64 {
59 let mut hasher = DefaultHasher::new();
60 item.hash(&mut hasher);
61 let hash1 = hasher.finish();
62
63 let combined = format!("{}{}", item, i);
64 let hash2 = hash(combined.as_bytes());
65
66 let combined_hash = hash1.wrapping_add((i as u64).wrapping_mul(hash2));
67 combined_hash % self.num_bits
68 }
69}
70