Skip to main content

rustolio_utils/
bloom_filter.rs

1//
2// SPDX-License-Identifier: MPL-2.0
3//
4// Copyright (c) 2026 Tobias Binnewies. All rights reserved.
5//
6// This Source Code Form is subject to the terms of the Mozilla Public
7// License, v. 2.0. If a copy of the MPL was not distributed with this
8// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9//
10
11#![allow(deprecated)]
12
13use std::hash::{Hash, Hasher as _, SipHasher};
14
15use crate::{bit_vec::BitVector, crypto::rand, prelude::*};
16
17#[derive(Debug, Clone)]
18pub struct BloomFilter {
19    bits: BitVector,
20    k_num: u32,
21    sips: [SipHasher; 2],
22    seed: [u64; 4],
23}
24
25impl PartialEq for BloomFilter {
26    fn eq(&self, other: &Self) -> bool {
27        self.bits == other.bits && self.k_num == other.k_num && self.seed == other.seed
28    }
29}
30impl Eq for BloomFilter {}
31
32/// m=−n⋅ln(p)​ / (ln2)^2
33/// m = number of bits needed
34/// n = expected number of items
35/// p = desired false positive probability
36fn calc_filter_size(n: usize, p: f64) -> usize {
37    assert!(n > 0);
38    assert!(p > 0.0 && p < 1.0);
39    let log2_2 = std::f64::consts::LN_2 * std::f64::consts::LN_2;
40    -((n as f64) * f64::ln(p) / log2_2).ceil() as usize
41}
42
43/// k= nm​ ln2
44/// m = number of bits in the bitmap (bitmap_bits)
45/// n = expected number of items (item_count)
46/// k = optimal number of hash functions
47fn optimal_k_num(m: u64, n: usize) -> u32 {
48    let m = m as f64;
49    let n = n as f64;
50    let k_num = (m / n * std::f64::consts::LN_2).round() as u32;
51    std::cmp::max(k_num, 1)
52}
53
54impl BloomFilter {
55    pub fn new(item_count: usize, p: f64) -> rand::Result<Self> {
56        let size = calc_filter_size(item_count, p);
57        let k_num = optimal_k_num(size as u64, item_count);
58
59        let s1 = rand::u64()?;
60        let s2 = rand::u64()?;
61        let s3 = rand::u64()?;
62        let s4 = rand::u64()?;
63        let sip1 = SipHasher::new_with_keys(s1, s2);
64        let sip2 = SipHasher::new_with_keys(s3, s4);
65
66        Ok(Self {
67            bits: BitVector::new(size),
68            k_num,
69            sips: [sip1, sip2],
70            seed: [s1, s2, s3, s4],
71        })
72    }
73
74    pub fn set(&mut self, item: &impl Hash) {
75        let mut hashes = [0; 2];
76        for k_i in 0..self.k_num {
77            let bit_offset = self.bloom_hash(&mut hashes, item, k_i) as usize % self.bits.len();
78            self.bits.set(bit_offset, true);
79        }
80    }
81
82    pub fn check(&self, item: &impl Hash) -> bool {
83        let mut hashes = [0u64, 0u64];
84        for k_i in 0..self.k_num {
85            let bit_offset = self.bloom_hash(&mut hashes, item, k_i) as usize % self.bits.len();
86            if !self.bits.get(bit_offset) {
87                return false;
88            }
89        }
90        true
91    }
92
93    pub fn check_and_set(&mut self, item: &impl Hash) -> bool {
94        let mut hashes = [0u64, 0u64];
95        let mut found = true;
96        for k_i in 0..self.k_num {
97            let bit_offset = self.bloom_hash(&mut hashes, item, k_i) as usize % self.bits.len();
98            if !self.bits.get(bit_offset) {
99                found = false;
100                self.bits.set(bit_offset, true);
101            }
102        }
103        found
104    }
105
106    fn bloom_hash(&self, hashes: &mut [u64; 2], item: &impl Hash, k_i: u32) -> u64 {
107        if k_i > 1 {
108            return (hashes[0]).wrapping_add((k_i as u64).wrapping_mul(hashes[1]))
109                % 0xFFFF_FFFF_FFFF_FFC5u64;
110            //largest u64 prime
111        }
112        let sip = &mut self.sips[k_i as usize].clone();
113        item.hash(sip);
114        let hash = sip.finish();
115        hashes[k_i as usize] = hash;
116        hash
117    }
118}
119
120impl Encode for BloomFilter {
121    #[async_impl]
122    async fn encode_async(
123        &self,
124        writer: &mut impl __utils_macros::AsyncEncoder,
125    ) -> crate::Result<()> {
126        self.bits.encode_async(writer).await?;
127        self.k_num.encode_async(writer).await?;
128        self.seed.encode_async(writer).await?;
129        Ok(())
130    }
131
132    fn encode_size(&self) -> usize {
133        self.bits.encode_size() + self.k_num.encode_size() + self.seed.encode_size()
134    }
135}
136
137impl Decode for BloomFilter {
138    #[async_impl]
139    async fn decode_async(reader: &mut impl __utils_macros::AsyncDecoder) -> crate::Result<Self> {
140        let bits = BitVector::decode_async(reader).await?;
141        let k_num = u32::decode_async(reader).await?;
142        let seed = <[u64; 4]>::decode_async(reader).await?;
143        let sip1 = SipHasher::new_with_keys(seed[0], seed[1]);
144        let sip2 = SipHasher::new_with_keys(seed[2], seed[3]);
145        Ok(Self {
146            bits,
147            k_num,
148            sips: [sip1, sip2],
149            seed,
150        })
151    }
152}
153
154#[cfg(test)]
155mod tests {
156    use crate::bytes::encoding::{decode_from_bytes, encode_to_bytes};
157
158    use super::*;
159
160    #[test]
161    fn test_bloom_basic() {
162        let i0 = "test";
163        let i1 = "test1";
164        let i2 = "test2";
165
166        let mut bloom = BloomFilter::new(99, 0.001).unwrap();
167        assert_eq!(bloom.k_num, 10);
168        assert_eq!(bloom.bits.len(), 1423);
169
170        assert!(!bloom.check(&i0));
171        assert!(!bloom.check(&i1));
172        assert!(!bloom.check(&i2));
173
174        bloom.set(&i0);
175        assert!(bloom.check(&i0));
176        assert!(!bloom.check(&i1));
177        assert!(!bloom.check(&i2));
178
179        bloom.set(&i1);
180        assert!(bloom.check(&i0));
181        assert!(bloom.check(&i1));
182        assert!(!bloom.check(&i2));
183
184        bloom.set(&i2);
185        assert!(bloom.check(&i0));
186        assert!(bloom.check(&i1));
187        assert!(bloom.check(&i2));
188    }
189
190    #[test]
191    fn test_bloom_encoding() {
192        let i0 = 123;
193        let i1 = 456;
194        let i2 = 789;
195
196        let mut bloom = BloomFilter::new(9999, 0.0001).unwrap();
197        assert_eq!(bloom.k_num, 13);
198        assert_eq!(bloom.bits.len(), 191681);
199
200        assert!(!bloom.check(&i0));
201        assert!(!bloom.check(&i1));
202        assert!(!bloom.check(&i2));
203
204        bloom.set(&i0);
205        assert!(bloom.check(&i0));
206        assert!(!bloom.check(&i1));
207        assert!(!bloom.check(&i2));
208
209        let encoded = encode_to_bytes(&bloom).unwrap();
210        let mut bloom: BloomFilter = decode_from_bytes(encoded).unwrap();
211
212        assert!(bloom.check(&i0));
213        assert!(!bloom.check(&i1));
214        assert!(!bloom.check(&i2));
215
216        bloom.set(&i1);
217        assert!(bloom.check(&i0));
218        assert!(bloom.check(&i1));
219        assert!(!bloom.check(&i2));
220    }
221}