rustolio_utils/
bloom_filter.rs1#![allow(deprecated)]
12
13use std::hash::{Hash, Hasher as _, SipHasher};
14
15use crate::{bit_vec::BitVector, crypto::rand, prelude::*};
16
17#[derive(Debug, Clone)]
18pub struct BloomFilter {
19 bits: BitVector,
20 k_num: u32,
21 sips: [SipHasher; 2],
22 seed: [u64; 4],
23}
24
25impl PartialEq for BloomFilter {
26 fn eq(&self, other: &Self) -> bool {
27 self.bits == other.bits && self.k_num == other.k_num && self.seed == other.seed
28 }
29}
30impl Eq for BloomFilter {}
31
32fn calc_filter_size(n: usize, p: f64) -> usize {
37 assert!(n > 0);
38 assert!(p > 0.0 && p < 1.0);
39 let log2_2 = std::f64::consts::LN_2 * std::f64::consts::LN_2;
40 -((n as f64) * f64::ln(p) / log2_2).ceil() as usize
41}
42
43fn optimal_k_num(m: u64, n: usize) -> u32 {
48 let m = m as f64;
49 let n = n as f64;
50 let k_num = (m / n * std::f64::consts::LN_2).round() as u32;
51 std::cmp::max(k_num, 1)
52}
53
54impl BloomFilter {
55 pub fn new(item_count: usize, p: f64) -> rand::Result<Self> {
56 let size = calc_filter_size(item_count, p);
57 let k_num = optimal_k_num(size as u64, item_count);
58
59 let s1 = rand::u64()?;
60 let s2 = rand::u64()?;
61 let s3 = rand::u64()?;
62 let s4 = rand::u64()?;
63 let sip1 = SipHasher::new_with_keys(s1, s2);
64 let sip2 = SipHasher::new_with_keys(s3, s4);
65
66 Ok(Self {
67 bits: BitVector::new(size),
68 k_num,
69 sips: [sip1, sip2],
70 seed: [s1, s2, s3, s4],
71 })
72 }
73
74 pub fn set(&mut self, item: &impl Hash) {
75 let mut hashes = [0; 2];
76 for k_i in 0..self.k_num {
77 let bit_offset = self.bloom_hash(&mut hashes, item, k_i) as usize % self.bits.len();
78 self.bits.set(bit_offset, true);
79 }
80 }
81
82 pub fn check(&self, item: &impl Hash) -> bool {
83 let mut hashes = [0u64, 0u64];
84 for k_i in 0..self.k_num {
85 let bit_offset = self.bloom_hash(&mut hashes, item, k_i) as usize % self.bits.len();
86 if !self.bits.get(bit_offset) {
87 return false;
88 }
89 }
90 true
91 }
92
93 pub fn check_and_set(&mut self, item: &impl Hash) -> bool {
94 let mut hashes = [0u64, 0u64];
95 let mut found = true;
96 for k_i in 0..self.k_num {
97 let bit_offset = self.bloom_hash(&mut hashes, item, k_i) as usize % self.bits.len();
98 if !self.bits.get(bit_offset) {
99 found = false;
100 self.bits.set(bit_offset, true);
101 }
102 }
103 found
104 }
105
106 fn bloom_hash(&self, hashes: &mut [u64; 2], item: &impl Hash, k_i: u32) -> u64 {
107 if k_i > 1 {
108 return (hashes[0]).wrapping_add((k_i as u64).wrapping_mul(hashes[1]))
109 % 0xFFFF_FFFF_FFFF_FFC5u64;
110 }
112 let sip = &mut self.sips[k_i as usize].clone();
113 item.hash(sip);
114 let hash = sip.finish();
115 hashes[k_i as usize] = hash;
116 hash
117 }
118}
119
120impl Encode for BloomFilter {
121 #[async_impl]
122 async fn encode_async(
123 &self,
124 writer: &mut impl __utils_macros::AsyncEncoder,
125 ) -> crate::Result<()> {
126 self.bits.encode_async(writer).await?;
127 self.k_num.encode_async(writer).await?;
128 self.seed.encode_async(writer).await?;
129 Ok(())
130 }
131
132 fn encode_size(&self) -> usize {
133 self.bits.encode_size() + self.k_num.encode_size() + self.seed.encode_size()
134 }
135}
136
137impl Decode for BloomFilter {
138 #[async_impl]
139 async fn decode_async(reader: &mut impl __utils_macros::AsyncDecoder) -> crate::Result<Self> {
140 let bits = BitVector::decode_async(reader).await?;
141 let k_num = u32::decode_async(reader).await?;
142 let seed = <[u64; 4]>::decode_async(reader).await?;
143 let sip1 = SipHasher::new_with_keys(seed[0], seed[1]);
144 let sip2 = SipHasher::new_with_keys(seed[2], seed[3]);
145 Ok(Self {
146 bits,
147 k_num,
148 sips: [sip1, sip2],
149 seed,
150 })
151 }
152}
153
154#[cfg(test)]
155mod tests {
156 use crate::bytes::encoding::{decode_from_bytes, encode_to_bytes};
157
158 use super::*;
159
160 #[test]
161 fn test_bloom_basic() {
162 let i0 = "test";
163 let i1 = "test1";
164 let i2 = "test2";
165
166 let mut bloom = BloomFilter::new(99, 0.001).unwrap();
167 assert_eq!(bloom.k_num, 10);
168 assert_eq!(bloom.bits.len(), 1423);
169
170 assert!(!bloom.check(&i0));
171 assert!(!bloom.check(&i1));
172 assert!(!bloom.check(&i2));
173
174 bloom.set(&i0);
175 assert!(bloom.check(&i0));
176 assert!(!bloom.check(&i1));
177 assert!(!bloom.check(&i2));
178
179 bloom.set(&i1);
180 assert!(bloom.check(&i0));
181 assert!(bloom.check(&i1));
182 assert!(!bloom.check(&i2));
183
184 bloom.set(&i2);
185 assert!(bloom.check(&i0));
186 assert!(bloom.check(&i1));
187 assert!(bloom.check(&i2));
188 }
189
190 #[test]
191 fn test_bloom_encoding() {
192 let i0 = 123;
193 let i1 = 456;
194 let i2 = 789;
195
196 let mut bloom = BloomFilter::new(9999, 0.0001).unwrap();
197 assert_eq!(bloom.k_num, 13);
198 assert_eq!(bloom.bits.len(), 191681);
199
200 assert!(!bloom.check(&i0));
201 assert!(!bloom.check(&i1));
202 assert!(!bloom.check(&i2));
203
204 bloom.set(&i0);
205 assert!(bloom.check(&i0));
206 assert!(!bloom.check(&i1));
207 assert!(!bloom.check(&i2));
208
209 let encoded = encode_to_bytes(&bloom).unwrap();
210 let mut bloom: BloomFilter = decode_from_bytes(encoded).unwrap();
211
212 assert!(bloom.check(&i0));
213 assert!(!bloom.check(&i1));
214 assert!(!bloom.check(&i2));
215
216 bloom.set(&i1);
217 assert!(bloom.check(&i0));
218 assert!(bloom.check(&i1));
219 assert!(!bloom.check(&i2));
220 }
221}