1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
extern crate bytes; use bytes::{BufMut, BytesMut, Bytes, Buf}; fn get_u32_little_end(arr: &[u8]) -> u32 { ((arr[0] as u32) << 0) + ((arr[1] as u32) << 8) + ((arr[2] as u32) << 16) + ((arr[3] as u32) << 24) } fn hash(data: &Bytes, seed: u32) -> u32 { let m = 0xc6a4a793 as u32; let r = 24 as u32; let mut h = seed ^ (data.len() as u64 * m as u64) as u32; let n = data.len() - data.len() % 4; let mut i = 0; while i < n { h = (h as u64 + get_u32_little_end(&data[i..]) as u64) as u32; h = (h as u64 * m as u64) as u32; h ^= (h >> 16); i += 4; } let flag = data.len() - i; if flag == 3 { h += (data.len() as u32) << 16; } else if flag == 2 { h += (data.len() as u32) << 8; } else if flag == 1 { h += data[i] as u32; h = (h as u64 * m as u64) as u32; h ^= (h >> r); } return h } fn bloom_hash(data: &Bytes) -> u32 { hash(data, 0xbc9f1d34) } pub struct BloomFilter { bits_per_key: usize, k: u8, key_hashes: Vec<u32>, } impl BloomFilter { pub fn new(bits_per_key: usize) -> Self { let mut k = (bits_per_key as f64 * 0.69) as u8; if k < 1 { k = 1; } if k > 30 { k = 30; } BloomFilter { bits_per_key, k, key_hashes: vec![] } } pub fn contains(&self, filter: &Bytes, key: &Bytes) -> bool { let n_bytes = filter.len() - 1; if n_bytes < 1 { return false } let n_bits = (n_bytes * 8) as u32; let k = filter[n_bytes]; if k > 30 { return true } let mut kh = bloom_hash(key); let delta = (kh >> 17) | (kh << 15); for _ in 0..k { let bitpos = (kh % n_bits) as usize; if filter[bitpos/8] as u32 & (1 << (bitpos % 8)) == 0 { return false } kh = (kh as u64 + delta as u64) as u32; } return true } pub fn add(&mut self, key: &Bytes) { self.key_hashes.push(bloom_hash(key)) } pub fn generate(&mut self) -> Bytes { let mut n_bits = (self.key_hashes.len() * self.bits_per_key) as u32; if n_bits < 64 { n_bits = 64; } let n_bytes = (n_bits + 7) / 8; n_bits = n_bytes * 8; let mut dest = BytesMut::new(); dest.resize(n_bytes as usize + 1, 0); dest[n_bytes as usize] = self.k; for v in &self.key_hashes { let mut kh = v.clone(); let delta = (kh >> 17) | (kh << 15); for _ in 0..self.k { let bitpos = (kh % n_bits) as usize; dest[bitpos/8] |= (1 << (bitpos % 8)); kh = (kh as u64 + delta as u64) as u32; } } self.key_hashes.clear(); dest.freeze() } } #[cfg(test)] mod tests { use bytes::{Bytes, BytesMut, BufMut, Buf}; use crate::BloomFilter; fn num_to_bytes(num: u32) -> Bytes { let mut bs = BytesMut::new(); bs.put_u32_le(num); bs.freeze() } #[test] fn it_works() { let mut bf = BloomFilter::new(10); let n = 10000; for i in 0..n { bf.add(&num_to_bytes(i)) } let filter = bf.generate(); for i in 0..n { if !bf.contains(&filter, &num_to_bytes(i)) { panic!(format!("Error in {}", i)) } } let mut rate: f32 = 0.0; for i in 0..n { if bf.contains(&filter, &num_to_bytes(i + n + 1)) { rate += 1.0; } } rate /= n as f32; if rate > 0.02 { panic!(format!("False positive rate is more than 2%%, got {}, at len {}", rate, n)) } else { println!("False positive rate is {}", rate) } } }