commonware_cryptography/bloomfilter/
mod.rs1#[cfg(all(test, feature = "arbitrary"))]
4mod conformance;
5
6use crate::{sha256::Sha256, Hasher};
7use bytes::{Buf, BufMut};
8use commonware_codec::{
9 codec::{Read, Write},
10 error::Error as CodecError,
11 EncodeSize, FixedSize,
12};
13use commonware_utils::bitmap::BitMap;
14use core::{
15 marker::PhantomData,
16 num::{NonZeroU64, NonZeroU8, NonZeroUsize},
17};
18#[cfg(feature = "std")]
19use {
20 commonware_utils::rational::BigRationalExt,
21 num_rational::BigRational,
22 num_traits::{One, ToPrimitive, Zero},
23};
24
25#[cfg(feature = "std")]
27const LN2: (u64, u64) = (14397, 20769);
28
29#[cfg(feature = "std")]
31const LN2_INV: (u64, u64) = (29145, 20201);
32
33#[derive(Clone, Debug)]
60pub struct BloomFilter<H: Hasher = Sha256> {
61 hashers: u8,
62 bits: BitMap,
63 _marker: PhantomData<H>,
64}
65
66impl<H: Hasher> PartialEq for BloomFilter<H> {
67 fn eq(&self, other: &Self) -> bool {
68 self.hashers == other.hashers && self.bits == other.bits
69 }
70}
71
72impl<H: Hasher> Eq for BloomFilter<H> {}
73
74impl<H: Hasher> BloomFilter<H> {
75 const _ASSERT_DIGEST_AT_LEAST_16_BYTES: () = assert!(
77 <H::Digest as FixedSize>::SIZE >= 16,
78 "digest must be at least 128 bits (16 bytes)"
79 );
80
81 pub fn new(hashers: NonZeroU8, bits: NonZeroUsize) -> Self {
86 let bits = bits
87 .get()
88 .checked_next_power_of_two()
89 .unwrap_or(1 << (usize::BITS - 1));
90 Self {
91 hashers: hashers.get(),
92 bits: BitMap::zeroes(bits as u64),
93 _marker: PhantomData,
94 }
95 }
96
97 #[cfg(feature = "std")]
111 pub fn with_rate(expected_items: NonZeroUsize, fp_rate: BigRational) -> Self {
112 let bits = Self::optimal_bits(expected_items.get(), &fp_rate);
113 let hashers = Self::optimal_hashers(expected_items.get(), bits);
114 Self {
115 hashers,
116 bits: BitMap::zeroes(bits as u64),
117 _marker: PhantomData,
118 }
119 }
120
121 pub const fn hashers(&self) -> NonZeroU8 {
123 NonZeroU8::new(self.hashers).expect("hashers is never zero")
124 }
125
126 pub const fn bits(&self) -> NonZeroUsize {
128 NonZeroUsize::new(self.bits.len() as usize).expect("bits is never zero")
129 }
130
131 fn indices(&self, item: &[u8]) -> impl Iterator<Item = u64> {
133 #[allow(path_statements)]
134 Self::_ASSERT_DIGEST_AT_LEAST_16_BYTES;
135
136 let digest = H::hash(item);
138 let h1 = u64::from_be_bytes(digest[0..8].try_into().unwrap());
139 let mut h2 = u64::from_be_bytes(digest[8..16].try_into().unwrap());
140
141 h2 |= 1;
144
145 let hashers = self.hashers as u64;
149 let mask = self.bits.len() - 1;
150 (0..hashers).map(move |hasher| h1.wrapping_add(hasher.wrapping_mul(h2)) & mask)
151 }
152
153 pub fn insert(&mut self, item: &[u8]) {
155 let indices = self.indices(item);
156 for index in indices {
157 self.bits.set(index, true);
158 }
159 }
160
161 pub fn contains(&self, item: &[u8]) -> bool {
165 let indices = self.indices(item);
166 for index in indices {
167 if !self.bits.get(index) {
168 return false;
169 }
170 }
171 true
172 }
173
174 #[cfg(feature = "std")]
181 pub fn estimated_false_positive_rate(&self) -> BigRational {
182 let ones = self.bits.count_ones();
183 let len = self.bits.len();
184 let fill_ratio = BigRational::new(ones.into(), len.into());
185 fill_ratio.pow(self.hashers as i32)
186 }
187
188 #[cfg(feature = "std")]
195 pub fn estimated_count(&self) -> BigRational {
196 let m = self.bits.len();
197 let x = self.bits.count_ones();
198 let k = self.hashers as u64;
199 if x >= m {
200 return BigRational::from_usize(usize::MAX);
201 }
202
203 let one_minus_fill = BigRational::new((m - x).into(), m.into());
205 let log2_val = one_minus_fill.log2_floor(16);
206 let ln2 = BigRational::from_frac_u64(LN2.0, LN2.1);
207 let ln_result = &log2_val * &ln2;
208
209 let m_over_k = BigRational::new(m.into(), k.into());
211 -m_over_k * ln_result
212 }
213
214 #[cfg(feature = "std")]
219 pub fn optimal_hashers(expected_items: usize, bits: usize) -> u8 {
220 if expected_items == 0 {
221 return 1;
222 }
223
224 let ln2 = BigRational::from_frac_u64(LN2.0, LN2.1);
226 let k_ratio = BigRational::from_usize(bits) * ln2 / BigRational::from_usize(expected_items);
227 k_ratio.to_integer().to_u8().unwrap_or(16).clamp(1, 16)
228 }
229
230 #[cfg(feature = "std")]
242 pub fn optimal_bits(expected_items: usize, fp_rate: &BigRational) -> usize {
243 assert!(
244 fp_rate > &BigRational::zero() && fp_rate < &BigRational::one(),
245 "false positive rate must be in (0, 1)"
246 );
247
248 let log2_p = fp_rate.log2_floor(16);
251
252 let n = BigRational::from_usize(expected_items);
255 let ln2_inv = BigRational::from_frac_u64(LN2_INV.0, LN2_INV.1);
256 let bits_rational = -(&n * &log2_p * &ln2_inv);
257
258 let raw = bits_rational.ceil_to_u128().unwrap_or(1) as usize;
259 raw.max(1)
260 .checked_next_power_of_two()
261 .unwrap_or(1 << (usize::BITS - 1))
262 }
263}
264
265impl<H: Hasher> Write for BloomFilter<H> {
266 fn write(&self, buf: &mut impl BufMut) {
267 self.hashers.write(buf);
268 self.bits.write(buf);
269 }
270}
271
272impl<H: Hasher> Read for BloomFilter<H> {
273 type Cfg = (NonZeroU8, NonZeroU64);
275
276 fn read_cfg(
277 buf: &mut impl Buf,
278 (hashers_cfg, bits_cfg): &Self::Cfg,
279 ) -> Result<Self, CodecError> {
280 if !bits_cfg.get().is_power_of_two() {
281 return Err(CodecError::Invalid(
282 "BloomFilter",
283 "bits must be a power of 2",
284 ));
285 }
286 let hashers = u8::read_cfg(buf, &())?;
287 if hashers != hashers_cfg.get() {
288 return Err(CodecError::Invalid(
289 "BloomFilter",
290 "hashers doesn't match config",
291 ));
292 }
293 let bits = BitMap::read_cfg(buf, &bits_cfg.get())?;
294 if bits.len() != bits_cfg.get() {
295 return Err(CodecError::Invalid(
296 "BloomFilter",
297 "bitmap length doesn't match config",
298 ));
299 }
300 Ok(Self {
301 hashers,
302 bits,
303 _marker: PhantomData,
304 })
305 }
306}
307
308impl<H: Hasher> EncodeSize for BloomFilter<H> {
309 fn encode_size(&self) -> usize {
310 self.hashers.encode_size() + self.bits.encode_size()
311 }
312}
313
314#[cfg(feature = "arbitrary")]
315impl<H: Hasher> arbitrary::Arbitrary<'_> for BloomFilter<H> {
316 fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result<Self> {
317 let hashers = u8::arbitrary(u)?.max(1);
319 let bits_len = u.int_in_range(0..=u16::MAX as u64)?.next_power_of_two();
321 let mut bits = BitMap::with_capacity(bits_len);
322 for _ in 0..bits_len {
323 bits.push(u.arbitrary::<bool>()?);
324 }
325 Ok(Self {
326 hashers,
327 bits,
328 _marker: PhantomData,
329 })
330 }
331}
332
333#[cfg(test)]
334mod tests {
335 use super::*;
336 use commonware_codec::{Decode, Encode};
337 use commonware_utils::{NZUsize, NZU64, NZU8};
338
339 #[test]
340 fn test_insert_and_contains() {
341 let mut bf = BloomFilter::<Sha256>::new(NZU8!(10), NZUsize!(1000));
342 let item1 = b"hello";
343 let item2 = b"world";
344 let item3 = b"bloomfilter";
345
346 bf.insert(item1);
347 bf.insert(item2);
348
349 assert!(bf.contains(item1));
350 assert!(bf.contains(item2));
351 assert!(!bf.contains(item3));
352 }
353
354 #[test]
355 fn test_empty() {
356 let bf = BloomFilter::<Sha256>::new(NZU8!(5), NZUsize!(100));
357 assert!(!bf.contains(b"anything"));
358 }
359
360 #[test]
361 fn test_false_positives() {
362 let mut bf = BloomFilter::<Sha256>::new(NZU8!(10), NZUsize!(100));
363 for i in 0..10usize {
364 bf.insert(&i.to_be_bytes());
365 }
366
367 for i in 0..10usize {
369 assert!(bf.contains(&i.to_be_bytes()));
370 }
371
372 let mut false_positives = 0;
374 for i in 100..1100usize {
375 if bf.contains(&i.to_be_bytes()) {
376 false_positives += 1;
377 }
378 }
379
380 assert!(false_positives > 0);
383 assert!(false_positives < 1000);
384 }
385
386 #[test]
387 fn test_codec_roundtrip() {
388 let mut bf = BloomFilter::<Sha256>::new(NZU8!(5), NZUsize!(128));
389 bf.insert(b"test1");
390 bf.insert(b"test2");
391
392 let cfg = (NZU8!(5), NZU64!(128));
393
394 let encoded = bf.encode();
395 let decoded = BloomFilter::<Sha256>::decode_cfg(encoded, &cfg).unwrap();
396
397 assert_eq!(bf, decoded);
398 }
399
400 #[test]
401 fn test_codec_empty() {
402 let bf = BloomFilter::<Sha256>::new(NZU8!(4), NZUsize!(128));
403 let cfg = (NZU8!(4), NZU64!(128));
404 let encoded = bf.encode();
405 let decoded = BloomFilter::<Sha256>::decode_cfg(encoded, &cfg).unwrap();
406 assert_eq!(bf, decoded);
407 }
408
409 #[test]
410 fn test_codec_with_invalid_hashers() {
411 let mut bf = BloomFilter::<Sha256>::new(NZU8!(5), NZUsize!(128));
412 bf.insert(b"test1");
413 let encoded = bf.encode();
414
415 let cfg = (NZU8!(10), NZU64!(128));
417 let decoded = BloomFilter::<Sha256>::decode_cfg(encoded.clone(), &cfg);
418 assert!(matches!(
419 decoded,
420 Err(CodecError::Invalid(
421 "BloomFilter",
422 "hashers doesn't match config"
423 ))
424 ));
425
426 let cfg = (NZU8!(4), NZU64!(128));
428 let decoded = BloomFilter::<Sha256>::decode_cfg(encoded, &cfg);
429 assert!(matches!(
430 decoded,
431 Err(CodecError::Invalid(
432 "BloomFilter",
433 "hashers doesn't match config"
434 ))
435 ));
436 }
437
438 #[test]
439 fn test_codec_with_invalid_bits() {
440 let mut bf = BloomFilter::<Sha256>::new(NZU8!(5), NZUsize!(128));
441 bf.insert(b"test1");
442 let encoded = bf.encode();
443
444 let cfg = (NZU8!(5), NZU64!(64));
446 let result = BloomFilter::<Sha256>::decode_cfg(encoded.clone(), &cfg);
447 assert!(matches!(result, Err(CodecError::InvalidLength(128))));
448
449 let cfg = (NZU8!(5), NZU64!(256));
450 let result = BloomFilter::<Sha256>::decode_cfg(encoded.clone(), &cfg);
451 assert!(matches!(
452 result,
453 Err(CodecError::Invalid(
454 "BloomFilter",
455 "bitmap length doesn't match config"
456 ))
457 ));
458
459 let cfg = (NZU8!(5), NZU64!(100));
461 let result = BloomFilter::<Sha256>::decode_cfg(encoded, &cfg);
462 assert!(matches!(
463 result,
464 Err(CodecError::Invalid(
465 "BloomFilter",
466 "bits must be a power of 2"
467 ))
468 ));
469 }
470
471 #[test]
472 fn test_statistics() {
473 let mut bf = BloomFilter::<Sha256>::new(NZU8!(7), NZUsize!(1024));
474
475 assert_eq!(bf.estimated_count(), BigRational::zero());
477 assert_eq!(bf.estimated_false_positive_rate(), BigRational::zero());
478
479 for i in 0..100usize {
481 bf.insert(&i.to_be_bytes());
482 }
483
484 let estimated = bf.estimated_count();
486 let lower = BigRational::from_usize(75);
487 let upper = BigRational::from_usize(125);
488 assert!(estimated > lower && estimated < upper);
489
490 assert!(bf.estimated_false_positive_rate() > BigRational::zero());
492 assert!(bf.estimated_false_positive_rate() < BigRational::one());
493 }
494
495 #[test]
496 fn test_with_rate() {
497 let fp_rate = BigRational::from_frac_u64(1, 100);
499 let mut bf = BloomFilter::<Sha256>::with_rate(NZUsize!(1000), fp_rate.clone());
500
501 let expected_bits = BloomFilter::<Sha256>::optimal_bits(1000, &fp_rate);
503 let expected_hashers = BloomFilter::<Sha256>::optimal_hashers(1000, expected_bits);
504 assert_eq!(bf.bits().get(), expected_bits);
505 assert_eq!(bf.hashers().get(), expected_hashers);
506
507 for i in 0..1000usize {
509 bf.insert(&i.to_be_bytes());
510 }
511
512 for i in 0..1000usize {
514 assert!(bf.contains(&i.to_be_bytes()));
515 }
516
517 let mut false_positives = 0;
519 for i in 1000..2000usize {
520 if bf.contains(&i.to_be_bytes()) {
521 false_positives += 1;
522 }
523 }
524
525 assert!(false_positives < 20);
528 }
529
530 #[test]
531 fn test_optimal_hashers() {
532 let k = BloomFilter::<Sha256>::optimal_hashers(1000, 10000);
535 assert_eq!(k, 6);
536
537 let k = BloomFilter::<Sha256>::optimal_hashers(100, 1000);
540 assert_eq!(k, 6);
541
542 let k = BloomFilter::<Sha256>::optimal_hashers(1000, 100);
544 assert_eq!(k, 1);
545
546 let k = BloomFilter::<Sha256>::optimal_hashers(100, 100000);
548 assert_eq!(k, 16);
549
550 let k = BloomFilter::<Sha256>::optimal_hashers(0, 1000);
552 assert_eq!(k, 1);
553
554 let k = BloomFilter::<Sha256>::optimal_hashers(1 << 48, 1000);
557 assert_eq!(k, 1);
558 let k = BloomFilter::<Sha256>::optimal_hashers(usize::MAX, usize::MAX);
559 assert!((1..=16).contains(&k));
560 }
561
562 #[test]
563 fn test_optimal_bits() {
564 let fp_1pct = BigRational::from_frac_u64(1, 100);
568 let bits = BloomFilter::<Sha256>::optimal_bits(1000, &fp_1pct);
569 assert_eq!(bits, 16384);
570 assert!(bits.is_power_of_two());
571
572 let fp_001pct = BigRational::from_frac_u64(1, 100_000);
576 let bits_lower_fp = BloomFilter::<Sha256>::optimal_bits(10000, &fp_001pct);
577 assert_eq!(bits_lower_fp, 262144);
578 assert!(bits_lower_fp.is_power_of_two());
579 }
580
581 #[test]
582 fn test_bits_extreme_values() {
583 let fp_001pct = BigRational::from_frac_u64(1, 10_000);
584 let fp_1pct = BigRational::from_frac_u64(1, 100);
585
586 let bits = BloomFilter::<Sha256>::optimal_bits(usize::MAX / 2, &fp_001pct);
588 assert!(bits.is_power_of_two());
589 assert!(bits > 0);
590
591 let bits = BloomFilter::<Sha256>::optimal_bits(1_000_000_000, &fp_001pct);
593 assert!(bits.is_power_of_two());
594
595 let bits = BloomFilter::<Sha256>::optimal_bits(0, &fp_1pct);
597 assert!(bits.is_power_of_two());
598 assert_eq!(bits, 1); }
600
601 #[test]
602 fn test_with_rate_deterministic() {
603 let fp_rate = BigRational::from_frac_u64(1, 100);
604 let bf1 = BloomFilter::<Sha256>::with_rate(NZUsize!(1000), fp_rate.clone());
605 let bf2 = BloomFilter::<Sha256>::with_rate(NZUsize!(1000), fp_rate);
606 assert_eq!(bf1.bits(), bf2.bits());
607 assert_eq!(bf1.hashers(), bf2.hashers());
608 }
609
610 #[test]
611 fn test_optimal_bits_matches_formula() {
612 let fp_rate = BigRational::from_frac_u64(1, 100);
616 let bits = BloomFilter::<Sha256>::optimal_bits(1000, &fp_rate);
617 assert_eq!(bits, 16384);
618 }
619}