memory_cache_rust/bloom/
bbloom.rs

1//!A Bloom filter is a space-efficient probabilistic data structure,
2//! conceived by Burton Howard Bloom in 1970,
3//! that is used to test whether an element is a member of a set.
4//! False positive matches are possible, but false negatives are not – in other words,
5//! a query returns either "possibly in set" or "definitely not in set".
6//! Elements can be added to the set, but not removed (though this can be addressed with the counting Bloom filter variant);
7//! the more items added, the larger the probability of false positives.
8
9
10
11
12use serde::{Deserialize, Serialize};
13
14const MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128];
15
16pub struct Bloom {
17    bitset: Vec<i64>,
18    elem_num: u64,
19    size_exp: u64,
20    size: u64,
21    set_locs: u64,
22    shift: u64,
23}
24
25fn calc_size_by_wrong_positives(num_entries: f64, wrongs: f64) -> (u64, u64) {
26
27    let size = -1.0 * num_entries * wrongs.ln() / 0.69314718056_f64.powf(2.0);
28    let locs = (0.69314718056_f64 * size / num_entries).ceil() ;
29    return (size as u64, locs as u64);
30}
31
32impl Bloom {
33    ///  returns a new bloom filter.
34    pub fn new(num_entries: f64, wrongs: f64) -> Self {
35        let mut entries = 0;
36        let mut locs = 0;
37        if wrongs < 1.0 {
38            let (e, l) = calc_size_by_wrong_positives(num_entries, wrongs);
39            entries = e;
40            locs = l;
41        } else {
42            entries = num_entries as u64;
43            locs = wrongs as u64;
44        }
45
46        let (size, exponent) = getSize(entries);
47        let mut b = Bloom {
48            bitset: vec![],
49            elem_num: 0,
50            size_exp: exponent,
51            size: size - 1,
52            set_locs: locs,
53            shift: 64 - exponent,
54        };
55        b.size(size);
56        b
57    }
58    /// <--- http://www.cse.yorku.ca/~oz/hash.html
59    /// modified Berkeley DB Hash (32bit)
60    /// hash is casted to l, h = 16bit fragments
61    /// func (bl Bloom) absdbm(b *[]byte) (l, h uint64) {
62    /// 	hash := uint64(len(*b))
63    /// 	for _, c := range *b {
64    /// 		hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash
65    /// 	}
66    /// 	h = hash >> bl.shift
67    ///    l = hash << bl.shift >> bl.shift
68    /// 	return l, h
69    /// }
70    pub fn add(&mut self, hash: u64) {
71        let h = hash >> self.shift;
72        let l = hash << self.shift >> self.shift;
73
74        for i in 0..self.set_locs {
75            self.set((h + (i * l)) & self.size);
76            self.elem_num += 1;
77        };
78    }
79    /// AddIfNotHas only Adds hash, if it's not present in the bloomfilter.
80    /// Returns true if hash was added.
81    /// Returns false if hash was already registered in the bloomfilter.
82    pub fn add_if_not_has(&mut self, hash: u64) -> bool {
83        if self.has(hash) {
84            return false;
85        }
86        self.add(hash);
87        true
88    }
89    /// Clear resets the Bloom filter.
90    pub fn clear(&mut self) {
91        self.bitset = vec![0; self.bitset.len()]
92    }
93    /// Set sets the bit[idx] of bitset.
94    pub fn set(&mut self, idx: u64) {
95        // let b = *self.bitset[(idx >> 6) as usize];
96
97        // let ptr:*mut [i64] =  self.bitset as *mut [i64];
98        let mut ptr: *mut i64 = self.bitset.as_mut_ptr();
99        unsafe {
100            let step = idx >> 6;//((idx >> 6) + ((idx % 64) >> 3));
101            ptr = ptr.wrapping_offset(step as isize);
102
103            *ptr |= MASK[(idx % 8) as usize] as i64;
104        };
105
106    }
107    /// Size makes Bloom filter with as bitset of size sz.
108    pub fn size(&mut self, sz: u64) {
109        self.bitset = Vec::with_capacity((sz >> 6) as usize); // vec![0i64; (sz >> 6) as usize]
110        for i in 0..(sz >> 6) as usize {
111            self.bitset.insert(i, 0)
112        }
113    }
114    /// Has checks if bit(s) for entry hash is/are set,
115    /// returns true if the hash was added to the Bloom Filter.
116    pub fn has(&mut self, hash: u64) -> bool {
117        let h = hash >> self.shift;
118        let l = hash << self.shift >> self.shift;
119        for i in 0..self.set_locs {
120            if !self.isset((h + (i * l)) & self.size) {
121                return false;
122            }
123        }
124
125        true
126    }
127    /// IsSet checks if bit[idx] of bitset is set, returns true/false.
128    pub fn isset(&mut self, idx: u64) -> bool {
129        let mut ptr: *mut i64 = self.bitset.as_mut_ptr();
130        // if ((idx >> 6) + ((idx % 64) >> 3)) as usize > self.bitset.len() {
131        //     return false;
132        // }
133        unsafe {
134            let step = idx >> 6 /*+ ((idx % 64) >> 3))*/;
135            ptr = ptr.wrapping_offset(step as isize);
136        }
137
138        let r = unsafe { (*ptr >> (idx % 8)) & 1 };
139        r == 1
140    }
141    /*  fn json_decode(&mut self, dbData: &[u8]) -> Self {
142          let data = serde_json::from_slice::<BloomJsonExport>(dbData);
143          i
144      }*/
145    fn json_encoder(&mut self) -> Vec<u8> {
146        let mut bj = BloomJsonExport {
147            set_locs: self.set_locs,
148            filter_set: vec![0u8; (self.bitset.len() << 3) as usize],
149        };
150
151        for i in 0..bj.filter_set.len() {
152            let ptr: *mut i64 = self.bitset.as_mut_ptr();
153            bj.filter_set[i] = unsafe { ptr.wrapping_offset(i as isize) as u8 }
154        }
155        let data = serde_json::to_vec(&bj);
156        if let Ok(result) = data {
157            return result;
158        }
159        vec![]
160    }
161}
162
163#[derive(Serialize, Deserialize, Debug)]
164pub struct BloomJsonExport {
165    filter_set: Vec<u8>,
166    set_locs: u64,
167}
168
169
170fn getSize(mut u_i64: u64) -> (u64, u64) {
171    if u_i64 < 512 {
172        u_i64 = 512;
173    }
174    let mut exponent = 0;
175    let mut size = 1;
176    while size < u_i64 {
177        size <<= 1;
178        exponent += 1;
179    }
180    return (size, exponent);
181}
182
183
184#[cfg(test)]
185mod tests {
186    use std::collections::HashSet;
187
188    use uuid::Uuid;
189
190    use crate::bloom::rutil::mem_hash;
191
192    use super::*;
193
194    const N: usize = 1 << 16;
195
196
197    fn worldlist() -> Vec<Vec<u8>> {
198        let _seed = [0u8; 16];
199        // let mut rng: StdRng = SeedableRng::from_seed(seed);
200
201        let mut wordlist = Vec::with_capacity(N);
202        for _i in 0..wordlist.capacity() {
203            // let mut bytes = [0u8; 32];
204            let uuid = Uuid::new_v4();
205            // rng.fill_bytes(&mut bytes);
206            // let v = rand::thread_rng().gen::<[u8; 32]>();
207
208            wordlist.push(Vec::from(uuid.as_bytes().map(|v| v)));
209        }
210        wordlist
211    }
212
213    #[test]
214    fn test_number_of_wrong() {
215        let mut bf = Bloom::new((N * 10) as f64, 7.0);
216        let mut cnt = 0;
217        let word_list = worldlist();
218        let mut set = HashSet::new();
219        // bf.add_if_not_has(1147594788350054766);
220        for i in 0..word_list.len() {
221            let hash = mem_hash(&*word_list[i]);
222            set.insert(hash);
223            if !bf.add_if_not_has(hash.into()) {
224                cnt += 1;
225            }
226        }
227
228        assert_eq!(set.len(), word_list.len());
229
230        println!("Bloomfilter New(7* 2**16, 7) \
231            (-> size={} bit): \n    \
232            Check for 'false positives': {}\
233             wrong positive 'Has' results on 2**16 entries => {} %%\n",
234                 bf.bitset.len() << 6, cnt, (cnt) as f64 / (N) as f64)
235    }
236
237    #[test]
238    fn test_has() {
239        let mut bf = Bloom::new((N * 10) as f64, 7.0);
240
241        let v = bf.has(18272025040905874063);
242        assert_eq!(v, false);
243
244        let _v = bf.has(18272025040905874063);
245        bf.add_if_not_has(18272025040905874063);
246        let v = bf.has(18272025040905874063);
247        assert_eq!(v, true)
248    }
249
250    #[test]
251    fn oprator_test() {
252        //  1 2 4 8 16 32 64
253        //0000=0 0001=1 0010=2 0100=4 1000 =8 011111
254        let a = 1; //01
255        let b = 2;
256        assert_eq!(a & b, 0);
257        assert_eq!(a | b, 3);
258        assert_eq!(a ^ b, 3);
259        assert_eq!(a << 4, 16);
260        assert_eq!(a >> b, 0);
261        assert_eq!(31 >> 4, 1);
262        assert_eq!(31 << 2, 124);
263        assert_eq!(31 >> 3, 3);
264    }
265}