oxihuman_core/
bloom_set.rs1#![allow(dead_code)]
4
5#[allow(dead_code)]
7#[derive(Debug, Clone)]
8pub struct BloomSet {
9 bits: Vec<u64>,
10 num_bits: usize,
11 num_hashes: usize,
12 count: usize,
13}
14
15#[allow(dead_code)]
16impl BloomSet {
17 pub fn new(num_bits: usize, num_hashes: usize) -> Self {
18 let words = num_bits.max(64).div_ceil(64);
19 Self {
20 bits: vec![0u64; words],
21 num_bits: words * 64,
22 num_hashes: num_hashes.max(1),
23 count: 0,
24 }
25 }
26
27 fn hash_indices(&self, key: &str) -> Vec<usize> {
28 let mut h1: u64 = 0;
29 let mut h2: u64 = 0;
30 for (i, b) in key.bytes().enumerate() {
31 h1 = h1.wrapping_mul(31).wrapping_add(b as u64);
32 h2 = h2
33 .wrapping_mul(37)
34 .wrapping_add((b as u64).wrapping_add(i as u64));
35 }
36 (0..self.num_hashes)
37 .map(|i| {
38 let h = h1.wrapping_add((i as u64).wrapping_mul(h2));
39 (h as usize) % self.num_bits
40 })
41 .collect()
42 }
43
44 pub fn insert(&mut self, key: &str) {
45 for idx in self.hash_indices(key) {
46 let word = idx / 64;
47 let bit = idx % 64;
48 self.bits[word] |= 1u64 << bit;
49 }
50 self.count += 1;
51 }
52
53 pub fn might_contain(&self, key: &str) -> bool {
54 self.hash_indices(key).iter().all(|&idx| {
55 let word = idx / 64;
56 let bit = idx % 64;
57 (self.bits[word] & (1u64 << bit)) != 0
58 })
59 }
60
61 pub fn insert_count(&self) -> usize {
62 self.count
63 }
64
65 pub fn num_bits(&self) -> usize {
66 self.num_bits
67 }
68
69 pub fn num_hashes(&self) -> usize {
70 self.num_hashes
71 }
72
73 pub fn bits_set(&self) -> usize {
74 self.bits.iter().map(|w| w.count_ones() as usize).sum()
75 }
76
77 pub fn fill_ratio(&self) -> f64 {
78 if self.num_bits == 0 {
79 return 0.0;
80 }
81 self.bits_set() as f64 / self.num_bits as f64
82 }
83
84 pub fn clear(&mut self) {
85 for w in &mut self.bits {
86 *w = 0;
87 }
88 self.count = 0;
89 }
90
91 pub fn is_empty(&self) -> bool {
92 self.count == 0
93 }
94
95 pub fn estimated_false_positive_rate(&self) -> f64 {
96 let m = self.num_bits as f64;
97 let k = self.num_hashes as f64;
98 let n = self.count as f64;
99 (1.0 - (-k * n / m).exp()).powf(k)
100 }
101}
102
103#[cfg(test)]
104mod tests {
105 use super::*;
106
107 #[test]
108 fn test_new() {
109 let bs = BloomSet::new(256, 3);
110 assert!(bs.is_empty());
111 assert_eq!(bs.num_hashes(), 3);
112 }
113
114 #[test]
115 fn test_insert_and_query() {
116 let mut bs = BloomSet::new(1024, 3);
117 bs.insert("hello");
118 assert!(bs.might_contain("hello"));
119 }
120
121 #[test]
122 fn test_likely_absent() {
123 let bs = BloomSet::new(1024, 3);
124 assert!(!bs.might_contain("never_inserted"));
125 }
126
127 #[test]
128 fn test_insert_count() {
129 let mut bs = BloomSet::new(1024, 3);
130 bs.insert("a");
131 bs.insert("b");
132 assert_eq!(bs.insert_count(), 2);
133 }
134
135 #[test]
136 fn test_bits_set() {
137 let mut bs = BloomSet::new(1024, 3);
138 assert_eq!(bs.bits_set(), 0);
139 bs.insert("key");
140 assert!(bs.bits_set() > 0);
141 }
142
143 #[test]
144 fn test_fill_ratio() {
145 let bs = BloomSet::new(1024, 3);
146 assert!((bs.fill_ratio()).abs() < f64::EPSILON);
147 }
148
149 #[test]
150 fn test_clear() {
151 let mut bs = BloomSet::new(256, 2);
152 bs.insert("x");
153 bs.clear();
154 assert!(bs.is_empty());
155 assert_eq!(bs.bits_set(), 0);
156 }
157
158 #[test]
159 fn test_false_positive_rate_zero() {
160 let bs = BloomSet::new(1024, 3);
161 assert!((bs.estimated_false_positive_rate()).abs() < f64::EPSILON);
162 }
163
164 #[test]
165 fn test_multiple_inserts() {
166 let mut bs = BloomSet::new(2048, 4);
167 for i in 0..50 {
168 bs.insert(&format!("key_{i}"));
169 }
170 for i in 0..50 {
171 assert!(bs.might_contain(&format!("key_{i}")));
172 }
173 }
174
175 #[test]
176 fn test_num_bits_rounded() {
177 let bs = BloomSet::new(100, 2);
178 assert!(bs.num_bits() >= 100);
179 assert_eq!(bs.num_bits() % 64, 0);
180 }
181}