fastbloom_rs/
bloom.rs

1use std::cmp::min;
2use std::fs::{File, OpenOptions};
3use std::fs;
4use std::io::{Write, Read};
5use std::ptr::slice_from_raw_parts;
6
7use xxhash_rust::xxh3::xxh3_64_with_seed;
8
9use crate::{Deletable, Hashes, Membership};
10use crate::builder::FilterBuilder;
11use crate::vec::{BloomBitVec, CountingVec};
12
13#[inline]
14fn bit_set(bit_set: &mut BloomBitVec, value: &[u8], m: u64, k: u64) {
15    // let len = m >> 5;
16    // let hash1 = (murmur3_x64_128(value, 0) % m) as u64;
17    // let hash2 = (murmur3_x64_128(value, 32) % m) as u64;
18    let hash1 = xxh3_64_with_seed(value, 0) % m;
19    let hash2 = xxh3_64_with_seed(value, 32) % m;
20
21    let m = m as u64;
22    for i in 1..k {
23        let mo = ((hash1 + i * hash2) % m) as usize;
24        bit_set.set(mo);
25    };
26    bit_set.set(hash1 as usize);
27}
28
29#[inline]
30fn bit_check(bit_set: &BloomBitVec, value: &[u8], m: u64, k: u64) -> bool {
31    // let hash1 = (murmur3_x64_128(value, 0) % m) as u64;
32    // let hash2 = (murmur3_x64_128(value, 32) % m) as u64;
33    let hash1 = xxh3_64_with_seed(value, 0) % m;
34    let hash2 = xxh3_64_with_seed(value, 32) % m;
35    let mut res = bit_set.get(hash1 as usize);
36    if !res { return false; }
37    // let m = m as u64;
38    for i in 1..k {
39        let mo = ((hash1 + i * hash2) % m) as usize;
40        res = res && bit_set.get(mo);
41        if !res { return false; }
42    }
43    res
44}
45
46#[inline]
47fn bit_check_and_set(bit_set: &mut BloomBitVec, value: &[u8], m: u64, k: u64) -> bool {
48    // let hash1 = (murmur3_x64_128(value, 0) % m) as u64;
49    // let hash2 = (murmur3_x64_128(value, 32) % m) as u64;
50    let hash1 = xxh3_64_with_seed(value, 0) % m;
51    let hash2 = xxh3_64_with_seed(value, 32) % m;
52    let mut res = bit_set.get(hash1 as usize);
53    bit_set.set(hash1 as usize);
54    // let m = m as u64;
55    for i in 1..k {
56        let mo = ((hash1 + i * hash2) % m) as usize;
57        res = res && bit_set.get(mo);
58        bit_set.set(mo);
59    }
60    res
61}
62
63#[inline]
64fn get_bit_indices(bit_set: &BloomBitVec, value: &[u8], m: u64, k: u64) -> Vec<u64> {
65    let mut res = Vec::<u64>::with_capacity(k as usize);
66    // let hash1 = (murmur3_x64_128(value, 0) % m) as u64;
67    // let hash2 = (murmur3_x64_128(value, 32) % m) as u64;
68    let hash1 = xxh3_64_with_seed(value, 0) % m;
69    let hash2 = xxh3_64_with_seed(value, 32) % m;
70    res.push(hash1);
71    // let m = m as u64;
72    for i in 1..k {
73        let mo = ((hash1 + i * hash2) % m) as usize;
74        res.push(mo as u64);
75    }
76    res
77}
78
79/// A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard
80/// Bloom in 1970, that is used to test whether an element is a member of a set. False positive
81/// matches are possible, but false negatives are not.
82///
83/// **Reference**: Bloom, B. H. (1970). Space/time trade-offs in hash coding with allowable errors.
84/// Communications of the ACM, 13(7), 422-426.
85/// [Full text article](http://crystal.uta.edu/~mcguigan/cse6350/papers/Bloom.pdf)
86#[derive(Clone)]
87#[derive(Debug)]
88#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
89pub struct BloomFilter {
90    config: FilterBuilder,
91    bit_set: BloomBitVec,
92}
93
94impl Membership for BloomFilter {
95    /// Adds the passed value to the filter.
96    fn add(&mut self, element: &[u8]) {
97        bit_set(&mut self.bit_set, element, self.config.size,
98                self.config.hashes as u64);
99    }
100
101    /// Tests whether an element is present in the filter (subject to the specified false
102    /// positive rate).
103    #[inline]
104    fn contains(&self, element: &[u8]) -> bool {
105        bit_check(&self.bit_set, element, self.config.size,
106                  self.config.hashes as u64)
107    }
108
109    /// Get the hashes indices of the element in the filter.
110    fn get_hash_indices(&self, element: &[u8]) -> Vec<u64> {
111        get_bit_indices(&self.bit_set, element, self.config.size,
112                        self.config.hashes as u64)
113    }
114
115    /// Tests whether a hashes indices is present in the filter
116    fn contains_hash_indices(&self, indices: &Vec<u64>) -> bool {
117        for x in indices.iter() {
118            let index = *x;
119            if !self.bit_set.get(index as usize) { return false; }
120        }
121        true
122    }
123
124    /// Removes all elements from the filter (i.e. resets all bits to zero).
125    fn clear(&mut self) {
126        self.bit_set.clear();
127    }
128}
129
130impl Hashes for BloomFilter {
131    ///  Returns the hash function number of the Bloom filter.
132    fn hashes(&self) -> u32 {
133        self.config.hashes
134    }
135}
136
137impl BloomFilter {
138    /// Build a Bloom filter form [FilterBuilder].
139    ///
140    /// # Examples:
141    ///
142    /// ```rust
143    /// use fastbloom_rs::{BloomFilter, FilterBuilder};
144    ///
145    /// let builder = FilterBuilder::new(100_000_000, 0.01);
146    /// let bloom = BloomFilter::new(builder);
147    /// ```
148    pub fn new(mut config: FilterBuilder) -> Self {
149        config.complete();
150        #[cfg(target_pointer_width = "64")]
151            let bit_set = BloomBitVec::new((config.size >> 6) as usize);
152        #[cfg(target_pointer_width = "32")]
153            let bit_set = BloomBitVec::new((config.size >> 5) as usize);
154        BloomFilter { config, bit_set }
155    }
156
157    /// Tests whether an element is present in the filter (subject to the specified false
158    /// positive rate). And if it is not in this filter, add it to the filter.
159    #[inline]
160    pub fn add_if_not_contains(&mut self, element: &[u8]) -> bool {
161        bit_check_and_set(&mut self.bit_set, element, self.config.size,
162                          self.config.hashes as u64)
163    }
164
165    /// Build a Bloom filter from file with first four bytes is hashes which is encode by big-endian.
166    /// The remaining is underlying byte vector of the Bloom filter.
167    pub fn from_file_with_hashes(path: &str) -> Self {
168        let mut f = File::open(path).unwrap();
169        let len = f.metadata().unwrap().len() - 4;
170        let mut hash = [0; 4];
171        f.read_exact(&mut hash).unwrap();
172        let hashes = u32::from_be_bytes(hash);
173
174        let mut config =
175            FilterBuilder::from_size_and_hashes((len * 8) as u64, hashes);
176        config.complete();
177
178        let bit_set = BloomBitVec::from_file(&mut f, 4, len);
179        
180        BloomFilter { config, bit_set }
181    }
182
183    /// Build a Bloom filter from file. The content is underlying byte vector of the Bloom filter.
184    pub fn from_file(path: &str, hashes: u32) -> Self {
185        let mut f = File::open(path).unwrap();
186        let len = f.metadata().unwrap().len();
187        let mut config =
188            FilterBuilder::from_size_and_hashes((len * 8) as u64, hashes);
189        config.complete();
190
191        let bit_set = BloomBitVec::from_file(&mut f, 0, len);
192        
193        BloomFilter { config, bit_set }
194    }
195
196    /// Build a Bloom filter form `&[u8]`.
197    ///
198    /// # Examples
199    ///
200    /// ```rust
201    /// use fastbloom_rs::BloomFilter;
202    /// let mut array = vec![0u8; 4096];
203    /// let bloom = BloomFilter::from_u8_array(array.as_bytes(), 4);
204    /// ```
205    pub fn from_u8_array(array: &[u8], hashes: u32) -> Self {
206        let mut config =
207            FilterBuilder::from_size_and_hashes((array.len() * 8) as u64, hashes);
208        config.complete();
209        #[cfg(target_pointer_width = "64")]
210            let mut bit_vec = BloomBitVec::new((config.size >> 6) as usize);
211        #[cfg(target_pointer_width = "32")]
212            let mut bit_vec = BloomBitVec::new((config.size >> 5) as usize);
213
214        let ptr = array.as_ptr() as *const usize;
215        #[cfg(target_pointer_width = "64")]
216            let usize_array = slice_from_raw_parts(ptr, (config.size >> 6) as usize);
217        #[cfg(target_pointer_width = "32")]
218            let usize_array = slice_from_raw_parts(ptr, (config.size >> 5) as usize);
219
220        bit_vec.storage.copy_from_slice(unsafe { &*usize_array });
221
222        BloomFilter { config, bit_set: bit_vec }
223    }
224
225    /// Build a Bloom filter form `&[u16]`.
226    ///
227    /// # Examples
228    ///
229    /// ```rust
230    /// use fastbloom_rs::BloomFilter;
231    /// let mut array = vec![0u16; 2048];
232    /// let bloom = BloomFilter::from_u16_array(array.as_bytes(), 4);
233    /// ```
234    pub fn from_u16_array(array: &[u16], hashes: u32) -> Self {
235        let mut config =
236            FilterBuilder::from_size_and_hashes((array.len() * 16) as u64, hashes);
237        config.complete();
238        #[cfg(target_pointer_width = "64")]
239            let mut bit_vec = BloomBitVec::new((config.size >> 6) as usize);
240        #[cfg(target_pointer_width = "32")]
241            let mut bit_vec = BloomBitVec::new((config.size >> 5) as usize);
242
243        let ptr = array.as_ptr() as *const usize;
244        #[cfg(target_pointer_width = "64")]
245            let usize_array = slice_from_raw_parts(ptr, (config.size >> 6) as usize);
246        #[cfg(target_pointer_width = "32")]
247            let usize_array = slice_from_raw_parts(ptr, (config.size >> 5) as usize);
248
249        bit_vec.storage.copy_from_slice(unsafe { &*usize_array });
250
251        BloomFilter { config, bit_set: bit_vec }
252    }
253
254
255    /// Build a Bloom filter form `&[u32]`.
256    ///
257    /// # Examples
258    ///
259    /// ```rust
260    /// use fastbloom_rs::BloomFilter;
261    /// let mut array = vec![0u32; 1024];
262    /// let bloom = BloomFilter::from_u32_array(array.as_bytes(), 4);
263    /// ```
264    pub fn from_u32_array(array: &[u32], hashes: u32) -> Self {
265        let mut config =
266            FilterBuilder::from_size_and_hashes((array.len() * 32) as u64, hashes);
267        config.complete();
268        #[cfg(target_pointer_width = "64")]
269            let mut bit_vec = BloomBitVec::new((config.size >> 6) as usize);
270        #[cfg(target_pointer_width = "32")]
271            let mut bit_vec = BloomBitVec::new((config.size >> 5) as usize);
272
273        let ptr = array.as_ptr() as *const usize;
274        #[cfg(target_pointer_width = "64")]
275            let usize_array = slice_from_raw_parts(ptr, (config.size >> 6) as usize);
276        #[cfg(target_pointer_width = "32")]
277            let usize_array = slice_from_raw_parts(ptr, (config.size >> 5) as usize);
278
279        bit_vec.storage.copy_from_slice(unsafe { &*usize_array });
280
281        BloomFilter { config, bit_set: bit_vec }
282    }
283
284    /// Build a Bloom filter form `&[u64]`.
285    ///
286    /// # Examples
287    ///
288    /// ```rust
289    /// use fastbloom_rs::BloomFilter;
290    /// let mut array = vec![0u64; 512];
291    /// let bloom = BloomFilter::from_u32_array(array.as_bytes(), 4);
292    /// ```
293    pub fn from_u64_array(array: &[u64], hashes: u32) -> Self {
294        let mut config =
295            FilterBuilder::from_size_and_hashes((array.len() * 64) as u64, hashes);
296        config.complete();
297        #[cfg(target_pointer_width = "64")]
298            let mut bit_vec = BloomBitVec::new((config.size >> 6) as usize);
299        #[cfg(target_pointer_width = "32")]
300            let mut bit_vec = BloomBitVec::new((config.size >> 5) as usize);
301
302        let ptr = array.as_ptr() as *const usize;
303        #[cfg(target_pointer_width = "64")]
304            let usize_array = slice_from_raw_parts(ptr, (config.size >> 6) as usize);
305        #[cfg(target_pointer_width = "32")]
306            let usize_array = slice_from_raw_parts(ptr, (config.size >> 5) as usize);
307
308        bit_vec.storage.copy_from_slice(unsafe { &*usize_array });
309
310        BloomFilter { config, bit_set: bit_vec }
311    }
312
313    /// Returns the configuration/builder of the Bloom filter.
314    /// # Examples
315    ///
316    /// ```rust
317    /// use fastbloom_rs::{BloomFilter, FilterBuilder};
318    ///
319    /// let bloom = FilterBuilder::new(100_000_000, 0.01).build_bloom_filter();
320    /// let builder = bloom.config();
321    /// ```
322    ///
323    pub fn config(&self) -> FilterBuilder {
324        self.config.clone()
325    }
326
327    /// Save the bloom filter to file, and the first four bytes is hashes with 
328    /// big-endian, and the remaining bytes is underlying byte vector of the Bloom filter.
329    pub fn save_to_file_with_hashes(&mut self, path: &str) {
330        let mut file = File::create(path).unwrap();
331        let hash = self.hashes().to_be_bytes();
332        file.write_all(&hash).unwrap();
333
334        let bytes = self.get_u8_array();
335        let mut file = OpenOptions::new().append(true).open(path).unwrap();
336        file.write_all(bytes).unwrap();
337    }
338
339    /// Save the bloom filter to file, and the content of the file is underlying byte 
340    /// vector of the Bloom filter.
341    pub fn save_to_file(&mut self, path: &str) {
342        let mut file = File::create(path).unwrap();
343        let bytes = self.get_u8_array();
344        file.write_all(bytes).unwrap();
345    }
346
347    /// Return the underlying byte vector of the Bloom filter.
348    pub fn get_u8_array(&self) -> &[u8] {
349        let storage = &self.bit_set.storage;
350        let ptr = storage.as_ptr();
351        let u8_ptr = ptr as *const u8;
352        #[cfg(target_pointer_width = "64")]
353            let ptr = slice_from_raw_parts(u8_ptr, storage.len() * 8);
354        #[cfg(target_pointer_width = "32")]
355            let ptr = slice_from_raw_parts(u8_ptr, storage.len() * 4);
356        unsafe { &*ptr }
357    }
358
359    /// Return the underlying u16 vector of the Bloom filter.
360    pub fn get_u16_array(&self) -> &[u16] {
361        let storage = &self.bit_set.storage;
362        let ptr = storage.as_ptr() as *const u16;
363        #[cfg(target_pointer_width = "64")]
364            let ptr = slice_from_raw_parts(ptr, storage.len() * 4);
365        #[cfg(target_pointer_width = "32")]
366            let ptr = slice_from_raw_parts(ptr, storage.len() * 2);
367        unsafe { &*ptr }
368    }
369
370    /// Return the underlying u32 vector of the Bloom filter.
371    pub fn get_u32_array(&self) -> &[u32] {
372        let storage = &self.bit_set.storage;
373        let ptr = storage.as_ptr() as *const u32;
374        #[cfg(target_pointer_width = "64")]
375            let ptr = slice_from_raw_parts(ptr, storage.len() * 2);
376        #[cfg(target_pointer_width = "32")]
377            let ptr = slice_from_raw_parts(ptr, storage.len());
378        unsafe { &*ptr }
379    }
380
381    /// Return the underlying u64 vector of the Bloom filter.
382    pub fn get_u64_array(&self) -> &[u64] {
383        let storage = &self.bit_set.storage;
384        let ptr = storage.as_ptr() as *const u64;
385        #[cfg(target_pointer_width = "64")]
386            let ptr = slice_from_raw_parts(ptr, storage.len());
387        if cfg!(target_pointer_width= "32") {
388            if storage.len() % 2 != 0 {
389                panic!("BloomBitVec with len {} can't export as u64 array!", storage.len())
390            }
391        }
392        #[cfg(target_pointer_width = "32")]
393            let ptr = slice_from_raw_parts(ptr, storage.len() / 2usize);
394
395        unsafe { &*ptr }
396    }
397
398
399    /// Performs the union operation on two compatible bloom filters. This is achieved through a
400    /// bitwise OR operation on their bit vectors. This operations is lossless, i.e. no elements
401    /// are lost and the bloom filter is the same that would have resulted if all elements wer
402    /// directly inserted in just one bloom filter.
403    pub fn union(&mut self, other: &BloomFilter) -> bool {
404        if self.compatible(other) {
405            self.bit_set.or(&other.bit_set);
406            true
407        } else { false }
408    }
409
410    /// Performs the intersection operation on two compatible bloom filters. This is achieved
411    /// through a bitwise AND operation on their bit vectors. The operations doesn't introduce
412    /// any false negatives but it does raise the false positive probability. The the false
413    /// positive probability in the resulting Bloom filter is at most the false-positive probability
414    /// in one of the constituent bloom filters
415    pub fn intersect(&mut self, other: &BloomFilter) -> bool {
416        if self.compatible(other) {
417            self.bit_set.and(&other.bit_set);
418            true
419        } else { false }
420    }
421
422    /// Returns [true] if the Bloom filter does not contain any elements
423    pub fn is_empty(&self) -> bool {
424        self.bit_set.is_empty()
425    }
426
427    /// Returns estimated cardinality of the set
428    /// see [Scalable and Efficient Privacy Preserving Global Itemset Support Approximation Using Bloom Filters](https://inria.hal.science/hal-01284874/document) as reference
429    pub fn estimate_set_cardinality(&self) -> f64 {
430        (self.bit_set.count_zeros() as f64 / self.config.size as f64).ln() / (self.hashes() as f64 * (1.0 - 1.0/self.config.size as f64).ln())
431    }
432
433    pub(crate) fn set_bit_vec(&mut self, bit_vec: BloomBitVec) {
434        assert_eq!(self.config.size, bit_vec.nbits as u64);
435        self.bit_set = bit_vec
436    }
437
438    /// Checks if two Bloom filters are compatible, i.e. have compatible parameters (hash function,
439    /// size, etc.)
440    fn compatible(&self, other: &BloomFilter) -> bool {
441        self.config.is_compatible_to(&other.config)
442    }
443}
444
445/// A Counting Bloom filter works in a similar manner as a regular Bloom filter; however, it is
446/// able to keep track of insertions and deletions. In a counting Bloom filter, each entry in the
447/// Bloom filter is a small counter associated with a basic Bloom filter bit.
448///
449/// **Reference**: F. Bonomi, M. Mitzenmacher, R. Panigrahy, S. Singh, and G. Varghese, “An Improved
450/// Construction for Counting Bloom Filters,” in 14th Annual European Symposium on
451/// Algorithms, LNCS 4168, 2006
452#[derive(Clone)]
453#[derive(Debug)]
454pub struct CountingBloomFilter {
455    config: FilterBuilder,
456    counting_vec: CountingVec,
457}
458
459macro_rules! get_array {
460    ($name:ident, $native:ty, $len:expr) => {
461        impl CountingBloomFilter {
462            pub fn $name(&self) -> &[$native] {
463                let ptr = self.counting_vec.storage.as_ptr() as *const $native;
464                #[cfg(target_pointer_width = "64")]
465                    let arr = slice_from_raw_parts(ptr, self.counting_vec.storage.len() * $len);
466                #[cfg(target_pointer_width = "32")]
467                    if cfg!(target_pointer_width= "32") {
468                        if self.counting_vec.storage.len() % 2 != 0 {
469                            panic!("CountingVec with len {} can't export as u64 array!", self.counting_vec.storage.len())
470                        }
471                    }
472                #[cfg(target_pointer_width = "32")]
473                    let arr = slice_from_raw_parts(ptr, self.counting_vec.storage.len() * $len / 2);
474                unsafe { &*arr }
475            }
476        }
477    };
478}
479
480get_array!(get_u8_array, u8, 8);
481get_array!(get_u16_array, u16, 4);
482get_array!(get_u32_array, u32, 2);
483get_array!(get_u64_array, u64, 1);
484
485impl CountingBloomFilter {
486    pub fn new(mut config: FilterBuilder) -> Self {
487        config.complete();
488        #[cfg(target_pointer_width = "64")]
489            let counting_vec = CountingVec::new((config.size >> 4) as usize);
490        #[cfg(target_pointer_width = "32")]
491            let counting_vec = CountingVec::new((config.size >> 3) as usize);
492        CountingBloomFilter { config, counting_vec }
493    }
494
495    pub(crate) fn set_counting_vec(&mut self, counting_vec: CountingVec) {
496        assert_eq!(self.config.size, counting_vec.counters as u64);
497        self.counting_vec = counting_vec
498    }
499
500    /// Checks if two Counting Bloom filters are compatible, i.e. have compatible parameters (hash
501    /// function, size, etc.)
502    fn compatible(&self, other: &BloomFilter) -> bool {
503        self.config.is_compatible_to(&other.config)
504    }
505
506    /// Returns the configuration/builder of the Bloom filter.
507    /// # Examples
508    ///
509    /// ```rust
510    /// use fastbloom_rs::{BloomFilter, FilterBuilder};
511    ///
512    /// let bloom = FilterBuilder::new(100_000_000, 0.01).build_bloom_filter();
513    /// let builder = bloom.config();
514    /// ```
515    ///
516    pub fn config(&self) -> FilterBuilder {
517        self.config.clone()
518    }
519}
520
521macro_rules! from_array {
522    ($name:ident, $native:ty, $num:expr) => {
523        impl CountingBloomFilter {
524            pub fn $name(array: &[$native], hashes: u32, enable_repeat_insert:bool) -> Self {
525                let mut config =
526                    FilterBuilder::from_size_and_hashes((array.len() * $num) as u64, hashes);
527                config.enable_repeat_insert(enable_repeat_insert);
528                config.complete();
529                #[cfg(target_pointer_width = "64")]
530                    let mut counting_vec = CountingVec::new((config.size >> 4) as usize);
531                #[cfg(target_pointer_width = "32")]
532                    let mut counting_vec = CountingVec::new((config.size >> 3) as usize);
533
534                let ptr = array.as_ptr() as *const usize;
535                #[cfg(target_pointer_width = "64")]
536                    let usize_array = slice_from_raw_parts(ptr, (config.size >> 4) as usize);
537                #[cfg(target_pointer_width = "32")]
538                    let usize_array = slice_from_raw_parts(ptr, (config.size >> 3) as usize);
539
540                counting_vec.storage.copy_from_slice(unsafe { &*usize_array });
541
542                CountingBloomFilter { config, counting_vec }
543            }
544        }
545    };
546}
547
548from_array!(from_u8_array, u8, 2);
549from_array!(from_u16_array, u16, 4);
550from_array!(from_u32_array, u32, 8);
551from_array!(from_u64_array, u64, 16);
552
553impl CountingBloomFilter {
554    /// Get the estimate count for element in this counting bloom filter.
555    /// See: https://github.com/yankun1992/fastbloom/issues/3
556    pub fn estimate_count(&self, element: &[u8]) -> usize {
557        let m = self.config.size;
558        let hash1 = xxh3_64_with_seed(element, 0) % m;
559        let hash2 = xxh3_64_with_seed(element, 32) % m;
560
561        let mut res = self.counting_vec.get(hash1 as usize);
562        if res == 0 { return 0; }
563
564        for i in 1..self.config.hashes as u64 {
565            let mo = ((hash1 + i * hash2) % m) as usize;
566            let count = self.counting_vec.get(mo);
567            if count == 0 { return 0; } else { res = min(count, res) }
568        }
569
570        res
571    }
572
573    /// Get the underlying counter at index.
574    pub fn counter_at(&self, index: u64) -> usize {
575        self.counting_vec.get(index as usize)
576    }
577}
578
579impl Membership for CountingBloomFilter {
580    fn add(&mut self, element: &[u8]) {
581        let m = self.config.size;
582        // let hash1 = (murmur3_x64_128(element, 0) % m) as u64;
583        // let hash2 = (murmur3_x64_128(element, 32) % m) as u64;
584        let hash1 = xxh3_64_with_seed(element, 0) % m;
585        let hash2 = xxh3_64_with_seed(element, 32) % m;
586
587        let mut res = self.counting_vec.get(hash1 as usize) > 0;
588        // let m = self.config.size;
589        for i in 1..self.config.hashes as u64 {
590            let mo = ((hash1 + i * hash2) % m) as usize;
591            res = res && (self.counting_vec.get(mo) > 0);
592        }
593
594        // contains and not enable repeat insert
595        if res && !self.config.enable_repeat_insert {
596            return;
597        }
598
599        // insert
600        for i in 1..self.config.hashes as u64 {
601            let mo = ((hash1 + i * hash2) % m) as usize;
602            self.counting_vec.increment(mo);
603        };
604        self.counting_vec.increment(hash1 as usize);
605    }
606
607    #[inline]
608    fn contains(&self, element: &[u8]) -> bool {
609        let m = self.config.size;
610        // let hash1 = (murmur3_x64_128(element, 0) % m) as u64;
611        // let hash2 = (murmur3_x64_128(element, 32) % m) as u64;
612        let hash1 = xxh3_64_with_seed(element, 0) % m;
613        let hash2 = xxh3_64_with_seed(element, 32) % m;
614
615        let mut res = self.counting_vec.get(hash1 as usize) > 0;
616        if !res { return false; }
617        // let m = self.config.size;
618        for i in 1..self.config.hashes as u64 {
619            let mo = ((hash1 + i * hash2) % m) as usize;
620            res = res && (self.counting_vec.get(mo) > 0);
621            if !res { return false; }
622        }
623        res
624    }
625
626    fn get_hash_indices(&self, element: &[u8]) -> Vec<u64> {
627        let m = self.config.size;
628        let mut res = Vec::<u64>::with_capacity(self.config.size as usize);
629        // let hash1 = (murmur3_x64_128(element, 0) % m) as u64;
630        // let hash2 = (murmur3_x64_128(element, 32) % m) as u64;
631        let hash1 = xxh3_64_with_seed(element, 0) % m;
632        let hash2 = xxh3_64_with_seed(element, 32) % m;
633        res.push(hash1);
634        // let m = self.config.size;
635        for i in 1..self.config.hashes as u64 {
636            let mo = ((hash1 + i * hash2) % m) as usize;
637            res.push(mo as u64);
638        }
639        res
640    }
641
642    fn contains_hash_indices(&self, indices: &Vec<u64>) -> bool {
643        for x in indices.iter() {
644            let index = *x;
645            if self.counting_vec.get(index as usize) == 0 { return false; }
646        }
647        true
648    }
649
650    fn clear(&mut self) {
651        self.counting_vec.clear()
652    }
653}
654
655impl Deletable for CountingBloomFilter {
656    fn remove(&mut self, element: &[u8]) {
657        let m = self.config.size;
658        // let hash1 = (murmur3_x64_128(element, 0) % m) as u64;
659        // let hash2 = (murmur3_x64_128(element, 32) % m) as u64;
660        let hash1 = xxh3_64_with_seed(element, 0) % m;
661        let hash2 = xxh3_64_with_seed(element, 32) % m;
662
663        let mut res = self.counting_vec.get(hash1 as usize) > 0;
664        // let m = self.config.size;
665        for i in 1..self.config.hashes as u64 {
666            let mo = ((hash1 + i * hash2) % m) as usize;
667            res = res && (self.counting_vec.get(mo) > 0);
668        }
669
670        // contains
671        if res {
672            for i in 1..self.config.hashes as u64 {
673                let mo = ((hash1 + i * hash2) % m) as usize;
674                self.counting_vec.decrement(mo);
675            };
676            self.counting_vec.decrement(hash1 as usize);
677        }
678    }
679}
680
681impl Hashes for CountingBloomFilter {
682    fn hashes(&self) -> u32 {
683        self.config.hashes
684    }
685}
686
687/// A Partitioned Bloom Filter is a variation of a classic Bloom Filter.
688///
689/// This filter works by partitioning the M-sized bit array into k slices of size `m = M/k` bits,
690/// `k = nb of hash functions` in the filter. Each hash function produces an index over `m` for its
691/// respective slice. Thus, each element is described by exactly `k` bits, meaning the distribution
692/// of false positives is uniform across all elements.
693///
694/// Be careful, as a Partitioned Bloom Filter have much higher collison risks that a classic
695/// Bloom Filter on small sets of data.
696///
697/// **Reference**: Chang, F., Feng, W. C., & Li, K. (2004, March). Approximate caches for packet
698/// classification. In INFOCOM 2004. Twenty-third AnnualJoint Conference of the IEEE Computer and
699/// Communications Societies (Vol. 4, pp. 2196-2207). IEEE.
700/// [Full text article](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.153.6902&rep=rep1&type=pdf)
701#[derive(Clone)]
702#[derive(Debug)]
703pub(crate) struct PartitionedBloomFilter {}
704
705impl PartitionedBloomFilter {}
706
707/// A Scalable Bloom Filter is a variant of Bloom Filters that can adapt dynamically to the number
708/// of elements stored, while assuring a maximum false positive probability.
709///
710/// **Reference**: ALMEIDA, Paulo Sérgio, BAQUERO, Carlos, PREGUIÇA, Nuno, et al. Scalable bloom
711/// filters. Information Processing Letters, 2007, vol. 101, no 6, p. 255-261.
712/// [Full text article](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.725.390&rep=rep1&type=pdf)
713#[derive(Clone)]
714#[derive(Debug)]
715pub(crate) struct ScalableBloomFilter {}
716
717impl ScalableBloomFilter {}
718
719/// An Invertible Bloom Filters (IBLT), also called Invertible Bloom Lookup Table, is a
720/// space-efficient and probabilistic data-structure for solving the set-difference problem
721/// efficiently without the use of logs or other prior context. It computes the set difference
722/// with communication proportional to the size of the difference between the sets being compared.
723/// They can simultaneously calculate D(A−B) and D(B−A) using O(d) space. This data structure
724/// encodes sets in a fashion that is similar in spirit to Tornado codes’ construction, in that it
725/// randomly combines elements using the XOR function.
726///
727/// **Reference**: Eppstein, D., Goodrich, M. T., Uyeda, F., & Varghese, G. (2011). What's the
728/// difference?: efficient set reconciliation without prior context. ACM SIGCOMM Computer
729/// Communication Review, 41(4), 218-229.
730/// [Full text article](http://www.sysnet.ucsd.edu/sysnet/miscpapers/EppGooUye-SIGCOMM-11.pdf)
731#[derive(Clone)]
732#[derive(Debug)]
733pub(crate) struct InvertibleBloomFilter {}
734
735impl InvertibleBloomFilter {}
736
737#[derive(Clone)]
738#[derive(Debug)]
739pub(crate) struct GarbledBloomFilter {}
740
741impl GarbledBloomFilter {}
742
743
744#[test]
745fn bloom_test() {
746    let mut builder =
747        FilterBuilder::new(10_000_000, 0.01);
748    let mut bloom = builder.build_bloom_filter();
749    println!("{:?}", bloom.config);
750    bloom.add(b"hello");
751    println!("{:?}", &bloom.bit_set.storage[0..300]);
752    assert_eq!(bloom.contains(b"hello"), true);
753    assert_eq!(bloom.contains(b"world"), false);
754    assert_eq!(bloom.add_if_not_contains(b"hello2"), false);
755    assert_eq!(bloom.contains(b"hello2"), true);
756
757    let storage = &bloom.bit_set.storage[0..300];
758    println!("{:?}", storage);
759
760    #[cfg(target_pointer_width = "64")]{
761        let mut bloom2 = BloomFilter::from_u64_array(bloom.get_u64_array(), bloom.hashes());
762        assert_eq!(bloom2.compatible(&bloom), true);
763        assert_eq!(bloom2.contains(b"hello"), true);
764        assert_eq!(bloom2.contains(b"world"), false);
765    }
766
767    let mut bloom3 =
768        BloomFilter::from_u32_array(bloom.get_u32_array(), bloom.config.hashes);
769    assert_eq!(bloom3.compatible(&bloom), true);
770    assert_eq!(bloom3.contains(b"hello"), true);
771    assert_eq!(bloom3.contains(b"world"), false);
772
773    let u8_array = bloom.get_u8_array();
774    let mut bloom4 = BloomFilter::from_u8_array(u8_array, bloom.config.hashes);
775    println!("{:?}", &bloom4.bit_set.storage[0..300]);
776    assert_eq!(bloom4.compatible(&bloom), true);
777    assert_eq!(bloom4.contains(b"hello"), true);
778    assert_eq!(bloom4.contains(b"world"), false);
779
780    let bloom5 = BloomFilter::from_u16_array(bloom.get_u16_array(), bloom.hashes());
781    assert_eq!(bloom5.compatible(&bloom), true);
782    assert_eq!(bloom5.contains(b"hello"), true);
783    assert_eq!(bloom5.contains(b"world"), false);
784
785    bloom4.add(b"hello world");
786
787    assert_eq!(bloom.intersect(&bloom4), true);
788    assert_eq!(bloom.contains(b"hello"), true);
789    assert_eq!(bloom.contains(b"hello world"), false);
790
791    bloom3.add(b"hello world");
792    bloom3.add(b"hello yankun");
793
794    assert_eq!(bloom3.union(&bloom4), true);
795    assert_eq!(bloom3.contains(b"hello"), true);
796    assert_eq!(bloom3.contains(b"hello world"), true);
797    assert_eq!(bloom3.contains(b"hello yankun"), true);
798}
799
800#[test]
801fn bloom_hash_indices_test() {
802    let mut builder =
803        FilterBuilder::new(10_000, 0.01);
804    let mut bloom = builder.build_bloom_filter();
805    println!("{:?}", bloom.config);
806    bloom.add(b"hello");
807    assert_eq!(bloom.contains(b"hello"), true);
808    assert_eq!(bloom.contains(b"world"), false);
809
810    let indices = bloom.get_hash_indices(b"hello");
811    println!("{:?}", indices);
812    assert_eq!(bloom.contains_hash_indices(&indices), true);
813    assert_eq!(bloom.contains_hash_indices(&bloom.get_hash_indices(b"world")), false);
814}
815
816#[test] 
817fn bloom_large() {
818    let mut builder =
819        FilterBuilder::new(1_000_000_000, 0.0001);
820    let mut bloom = builder.build_bloom_filter();
821
822    bloom.add(b"hello");
823    assert_eq!(bloom.contains(b"hello"), true);
824
825    let bloom = BloomFilter::from_u8_array(bloom.get_u8_array(), bloom.hashes());
826
827    assert_eq!(bloom.contains(b"hello"), true);
828
829}
830
831#[test]
832fn bloom_save_and_load_file_hashes() {
833    {
834        let mut builder = FilterBuilder::new(1_000_000_000, 0.0001);
835        let mut bloom = builder.build_bloom_filter();
836
837        bloom.add(b"hello");
838        assert_eq!(bloom.contains(b"hello"), true);
839        bloom.save_to_file_with_hashes("hello.bloom");
840    }
841    
842
843    let bloom2 = BloomFilter::from_file_with_hashes("hello.bloom");
844    fs::remove_file("hello.bloom").unwrap();
845
846    assert_eq!(bloom2.contains(b"hello"), true);
847    assert_eq!(bloom2.contains(b"world"), false);
848
849}
850
851#[test]
852fn bloom_save_and_load_file() {
853    let mut hashes = 0;
854    {
855        let mut builder = FilterBuilder::new(1_000_000_000, 0.0001);
856        let mut bloom = builder.build_bloom_filter();
857
858        bloom.add(b"hello");
859        assert_eq!(bloom.contains(b"hello"), true);
860
861        hashes = bloom.hashes();
862        
863        bloom.save_to_file("no_hashes.bloom");
864    }
865
866    let bloom2 = BloomFilter::from_file("no_hashes.bloom", hashes);
867    fs::remove_file("no_hashes.bloom").unwrap();
868
869    assert_eq!(bloom2.contains(b"hello"), true);
870    assert_eq!(bloom2.contains(b"world"), false);
871    
872}
873
874
875#[test]
876fn counting_bloom_test() {
877    let mut builder =
878        FilterBuilder::new(10_000, 0.01);
879    let mut bloom = builder.build_counting_bloom_filter();
880
881    bloom.add(b"hello");
882
883    assert_eq!(bloom.contains(b"hello"), true);
884
885    bloom.remove(b"hello");
886    assert_eq!(bloom.contains(b"hello"), false);
887}
888
889#[test]
890fn counting_bloom_repeat_test() {
891    let mut builder = FilterBuilder::new(100_000, 0.01);
892    // enable_repeat_insert is true
893    builder.enable_repeat_insert(true);
894    let mut cbf = builder.build_counting_bloom_filter();
895    cbf.add(b"hello"); // modify underlying vector counter.
896    cbf.add(b"hello"); // modify underlying vector counter.
897    assert_eq!(cbf.contains(b"hello"), true);
898    cbf.remove(b"hello");
899    assert_eq!(cbf.contains(b"hello"), true);
900    cbf.remove(b"hello");
901    assert_eq!(cbf.contains(b"hello"), false);
902
903    // enable_repeat_insert is false
904    builder.enable_repeat_insert(false);
905    let mut cbf = builder.build_counting_bloom_filter();
906    cbf.add(b"hello"); // modify underlying vector counter.
907    cbf.add(b"hello"); // not modify underlying vector counter because b"hello" has been added.
908    assert_eq!(cbf.contains(b"hello"), true);
909    cbf.remove(b"hello");
910    assert_eq!(cbf.contains(b"hello"), false);
911}
912
913#[test]
914fn counting_bloom_from_test() {
915    let mut builder = FilterBuilder::new(10_000_000, 0.01);
916    let mut cbf = builder.build_counting_bloom_filter();
917
918    cbf.add(b"hello");
919    cbf.add(b"hello");
920
921    let mut cbf_copy = CountingBloomFilter::from_u8_array(cbf.get_u8_array(), builder.hashes, true);
922    assert_eq!(cbf_copy.contains(b"hello"), true);
923    cbf_copy.remove(b"hello");
924    assert_eq!(cbf_copy.contains(b"hello"), true);
925    cbf_copy.remove(b"hello");
926    assert_eq!(cbf_copy.contains(b"hello"), false);
927
928    let mut cbf_copy = CountingBloomFilter::from_u16_array(cbf.get_u16_array(), builder.hashes, true);
929    assert_eq!(cbf_copy.contains(b"hello"), true);
930    cbf_copy.remove(b"hello");
931    assert_eq!(cbf_copy.contains(b"hello"), true);
932    cbf_copy.remove(b"hello");
933    assert_eq!(cbf_copy.contains(b"hello"), false);
934
935    let mut cbf_copy = CountingBloomFilter::from_u32_array(cbf.get_u32_array(), builder.hashes, true);
936    assert_eq!(cbf_copy.contains(b"hello"), true);
937    cbf_copy.remove(b"hello");
938    assert_eq!(cbf_copy.contains(b"hello"), true);
939    cbf_copy.remove(b"hello");
940    assert_eq!(cbf_copy.contains(b"hello"), false);
941
942    #[cfg(target_pointer_width = "64")]{
943        let mut cbf_copy = CountingBloomFilter::from_u64_array(cbf.get_u64_array(), builder.hashes, true);
944        assert_eq!(cbf_copy.contains(b"hello"), true);
945        cbf_copy.remove(b"hello");
946        assert_eq!(cbf_copy.contains(b"hello"), true);
947        cbf_copy.remove(b"hello");
948        assert_eq!(cbf_copy.contains(b"hello"), false);
949    }
950}
951
952#[test]
953fn counting_bloom_hash_indices_test() {
954    let mut builder =
955        FilterBuilder::new(10_000, 0.01);
956    let mut bloom = builder.build_counting_bloom_filter();
957
958    bloom.add(b"hello");
959
960    assert_eq!(bloom.contains(b"hello"), true);
961    assert_eq!(bloom.contains_hash_indices(&bloom.get_hash_indices(b"hello")), true);
962    assert_eq!(bloom.contains_hash_indices(&bloom.get_hash_indices(b"world")), false);
963
964
965    bloom.remove(b"hello");
966    assert_eq!(bloom.contains(b"hello"), false);
967    assert_eq!(bloom.contains_hash_indices(&bloom.get_hash_indices(b"hello")), false);
968}
969
970#[test]
971fn counting_bloom_estimate_count() {
972    let mut builder =
973        FilterBuilder::new(10_000, 0.01);
974    let mut bloom = builder.build_counting_bloom_filter();
975
976    bloom.add(b"hello");
977    bloom.add(b"world");
978
979    assert_eq!(bloom.estimate_count(b"hello"), 1);
980    let indices = bloom.get_hash_indices(b"hello");
981
982    for index in indices {
983        assert_eq!(bloom.counter_at(index), 1)
984    }
985
986    assert_eq!(bloom.estimate_count(b"world"), 1);
987    for index in bloom.get_hash_indices(b"world") {
988        assert!(bloom.counter_at(index) <= 2);
989    }
990}