Skip to main content

oxihuman_core/
bloom_set.rs

1// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3#![allow(dead_code)]
4
5/// A probabilistic set membership test using a bloom-filter style bit array.
6#[allow(dead_code)]
7#[derive(Debug, Clone)]
8pub struct BloomSet {
9    bits: Vec<u64>,
10    num_bits: usize,
11    num_hashes: usize,
12    count: usize,
13}
14
15#[allow(dead_code)]
16impl BloomSet {
17    pub fn new(num_bits: usize, num_hashes: usize) -> Self {
18        let words = num_bits.max(64).div_ceil(64);
19        Self {
20            bits: vec![0u64; words],
21            num_bits: words * 64,
22            num_hashes: num_hashes.max(1),
23            count: 0,
24        }
25    }
26
27    fn hash_indices(&self, key: &str) -> Vec<usize> {
28        let mut h1: u64 = 0;
29        let mut h2: u64 = 0;
30        for (i, b) in key.bytes().enumerate() {
31            h1 = h1.wrapping_mul(31).wrapping_add(b as u64);
32            h2 = h2
33                .wrapping_mul(37)
34                .wrapping_add((b as u64).wrapping_add(i as u64));
35        }
36        (0..self.num_hashes)
37            .map(|i| {
38                let h = h1.wrapping_add((i as u64).wrapping_mul(h2));
39                (h as usize) % self.num_bits
40            })
41            .collect()
42    }
43
44    pub fn insert(&mut self, key: &str) {
45        for idx in self.hash_indices(key) {
46            let word = idx / 64;
47            let bit = idx % 64;
48            self.bits[word] |= 1u64 << bit;
49        }
50        self.count += 1;
51    }
52
53    pub fn might_contain(&self, key: &str) -> bool {
54        self.hash_indices(key).iter().all(|&idx| {
55            let word = idx / 64;
56            let bit = idx % 64;
57            (self.bits[word] & (1u64 << bit)) != 0
58        })
59    }
60
61    pub fn insert_count(&self) -> usize {
62        self.count
63    }
64
65    pub fn num_bits(&self) -> usize {
66        self.num_bits
67    }
68
69    pub fn num_hashes(&self) -> usize {
70        self.num_hashes
71    }
72
73    pub fn bits_set(&self) -> usize {
74        self.bits.iter().map(|w| w.count_ones() as usize).sum()
75    }
76
77    pub fn fill_ratio(&self) -> f64 {
78        if self.num_bits == 0 {
79            return 0.0;
80        }
81        self.bits_set() as f64 / self.num_bits as f64
82    }
83
84    pub fn clear(&mut self) {
85        for w in &mut self.bits {
86            *w = 0;
87        }
88        self.count = 0;
89    }
90
91    pub fn is_empty(&self) -> bool {
92        self.count == 0
93    }
94
95    pub fn estimated_false_positive_rate(&self) -> f64 {
96        let m = self.num_bits as f64;
97        let k = self.num_hashes as f64;
98        let n = self.count as f64;
99        (1.0 - (-k * n / m).exp()).powf(k)
100    }
101}
102
103#[cfg(test)]
104mod tests {
105    use super::*;
106
107    #[test]
108    fn test_new() {
109        let bs = BloomSet::new(256, 3);
110        assert!(bs.is_empty());
111        assert_eq!(bs.num_hashes(), 3);
112    }
113
114    #[test]
115    fn test_insert_and_query() {
116        let mut bs = BloomSet::new(1024, 3);
117        bs.insert("hello");
118        assert!(bs.might_contain("hello"));
119    }
120
121    #[test]
122    fn test_likely_absent() {
123        let bs = BloomSet::new(1024, 3);
124        assert!(!bs.might_contain("never_inserted"));
125    }
126
127    #[test]
128    fn test_insert_count() {
129        let mut bs = BloomSet::new(1024, 3);
130        bs.insert("a");
131        bs.insert("b");
132        assert_eq!(bs.insert_count(), 2);
133    }
134
135    #[test]
136    fn test_bits_set() {
137        let mut bs = BloomSet::new(1024, 3);
138        assert_eq!(bs.bits_set(), 0);
139        bs.insert("key");
140        assert!(bs.bits_set() > 0);
141    }
142
143    #[test]
144    fn test_fill_ratio() {
145        let bs = BloomSet::new(1024, 3);
146        assert!((bs.fill_ratio()).abs() < f64::EPSILON);
147    }
148
149    #[test]
150    fn test_clear() {
151        let mut bs = BloomSet::new(256, 2);
152        bs.insert("x");
153        bs.clear();
154        assert!(bs.is_empty());
155        assert_eq!(bs.bits_set(), 0);
156    }
157
158    #[test]
159    fn test_false_positive_rate_zero() {
160        let bs = BloomSet::new(1024, 3);
161        assert!((bs.estimated_false_positive_rate()).abs() < f64::EPSILON);
162    }
163
164    #[test]
165    fn test_multiple_inserts() {
166        let mut bs = BloomSet::new(2048, 4);
167        for i in 0..50 {
168            bs.insert(&format!("key_{i}"));
169        }
170        for i in 0..50 {
171            assert!(bs.might_contain(&format!("key_{i}")));
172        }
173    }
174
175    #[test]
176    fn test_num_bits_rounded() {
177        let bs = BloomSet::new(100, 2);
178        assert!(bs.num_bits() >= 100);
179        assert_eq!(bs.num_bits() % 64, 0);
180    }
181}