use alloc::vec::Vec;
use core::f64::consts::LN_2;
use vpb::kvstructs::bytes::{BufMut, Bytes, BytesMut};
const SEED: u32 = 0xbc9f1d34;
const M: u32 = 0xc6a4a793;
pub fn hash(b: &[u8]) -> u32 {
let mut h = SEED ^ M.wrapping_mul(b.len() as u32);
let mut c = b;
while c.len() >= 4 {
h = h.wrapping_add(
(c[0] as u32)
| (c[1] as u32).wrapping_shl(8)
| (c[2] as u32).wrapping_shl(16)
| (c[3] as u32).wrapping_shl(24),
);
h = h.wrapping_mul(M);
h ^= h.wrapping_shr(16);
c = &c[4..];
}
match c.len() {
3 => {
h = h.wrapping_add((c[2] as u32) << 16);
h = h.wrapping_add((c[1] as u32) << 8);
h = h.wrapping_add(c[0] as u32);
h = h.wrapping_mul(M);
h ^= h.wrapping_shr(24);
}
2 => {
h = h.wrapping_add((c[1] as u32) << 8);
h = h.wrapping_add(c[0] as u32);
h = h.wrapping_mul(M);
h ^= h.wrapping_shr(24);
}
1 => {
h += c[0] as u32;
h = h.wrapping_mul(M);
h ^= h.wrapping_shr(24);
}
_ => {}
}
h
}
pub fn bloom_bits_per_key(num_entries: usize, fp: f64) -> usize {
let fne = num_entries as f64;
let size = -1f64 * fne * fp.ln() / LN_2.powi(2);
let locs = LN_2.ceil() * size / fne;
locs as usize
}
#[derive(Debug, Clone, Eq, PartialEq, Hash, Ord, PartialOrd)]
#[repr(transparent)]
pub struct Filter(Bytes);
impl core::ops::Deref for Filter {
type Target = Bytes;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl Default for Filter {
fn default() -> Self {
Self(Bytes::new())
}
}
impl AsRef<[u8]> for Filter {
fn as_ref(&self) -> &[u8] {
self.0.as_ref()
}
}
impl Filter {
pub fn new(keys: &[u32], bits_per_key: usize) -> Self {
let k = ((0.69 * bits_per_key as f64) as u32).max(1).min(30);
let mut nbs = (keys.len() * bits_per_key).max(64);
let nbytes = (nbs + 7) / 8;
nbs = nbytes * 8;
let mut filter = BytesMut::with_capacity(nbytes + 1);
filter.resize(nbytes, 0);
for h in keys {
let mut h = *h;
let delta = h >> 17 | h << 15;
(0..k).for_each(|_| {
let bit_pos = h % (nbs as u32);
filter[(bit_pos / 8) as usize] |= 1 << (bit_pos % 8);
h = h.wrapping_add(delta);
});
}
filter.put_u8(k as u8);
Filter(filter.freeze())
}
#[inline]
pub const fn len(&self) -> usize {
self.0.len()
}
#[inline]
pub const fn is_empty(&self) -> bool {
self.0.is_empty()
}
#[inline]
pub const fn cap(&self) -> usize {
self.0.len()
}
#[inline]
pub fn into_vec(self) -> Vec<u8> {
self.0.to_vec()
}
#[inline]
pub fn into_bytes(self) -> Bytes {
self.0
}
}
pub trait MayContain {
#[inline]
fn may_contain(&self, mut h: u32) -> bool
where
Self: AsRef<[u8]>,
{
let slice = self.as_ref();
let len = slice.len();
if len < 2 {
false
} else {
let k = slice[len - 1];
if k > 30 {
true
} else {
let nbs = (8 * (len - 1)) as u32;
let delta = h >> 17 | h << 15;
for _ in 0..k {
let bit_pos = h % nbs;
if slice[(bit_pos / 8) as usize] & (1 << (bit_pos % 8)) == 0 {
return false;
}
h = h.wrapping_add(delta);
}
true
}
}
}
#[inline]
fn may_contain_key(&self, k: &[u8]) -> bool
where
Self: AsRef<[u8]>,
{
self.may_contain(hash(k))
}
}
impl<T> MayContain for T {}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
impl ToString for Filter {
fn to_string(&self) -> String {
let mut s = vec!["."; 8 * self.len()];
for (i, x) in self.0.iter().enumerate() {
for j in 0..8 {
if *x & (1 << j) != 0 {
s[8 * i + j] = "1";
}
}
}
s.join("")
}
}
#[test]
fn test_small_filter() {
let mut hashes = Vec::new();
for word in &["hello".as_bytes(), "world".as_bytes()] {
hashes.push(hash(word));
}
let f = Filter::new(hashes.as_slice(), 10);
assert_eq!(
"1...1.........1.........1.....1...1...1.....1.........1.....1....11.....",
f.to_string()
);
let mut m = HashMap::new();
m.insert("hello", true);
m.insert("world", true);
m.insert("x", false);
m.insert("foo", false);
for (k, v) in m {
assert_eq!(f.may_contain_key(k.as_bytes()), v);
}
}
#[test]
fn test_filter() {
fn nxt_len(x: usize) -> usize {
if x < 10 {
return x + 1;
}
if x < 100 {
return x + 10;
}
if x < 1000 {
return x + 100;
}
x + 1000
}
fn le32(i: usize) -> Vec<u8> {
let mut b = Vec::new();
let i = i as u32;
b.push(i as u8);
b.push((i >> 8) as u8);
b.push((i >> 16) as u8);
b.push((i >> 24) as u8);
b
}
let (mut num_mediocre_filters, mut num_good_filters) = (0, 0);
'outer: loop {
let mut len = 1;
while len <= 10_000 {
let keys = (0..len).map(le32).collect::<Vec<_>>();
let hashes = keys
.iter()
.map(|key| hash(key.as_slice()))
.collect::<Vec<_>>();
let f = Filter::new(hashes.as_slice(), 10);
if f.len() > (len * 10 / 8) + 40 {
eprintln!("len={}: f.len()={} is too large", len, f.len());
continue;
}
for k in &keys {
if !f.may_contain_key(k.as_slice()) {
eprintln!("len={}: did not contain key {:?}", len, k);
continue 'outer;
}
}
let mut num_false_positive = 0;
for i in 0..10000 {
if f.may_contain_key(le32(1e9 as usize + i).as_slice()) {
num_false_positive += 1;
}
}
if num_false_positive > (0.02 * 10_000f64) as usize {
eprintln!(
"len={}: {} false positives in 10000",
len, num_false_positive
);
continue;
}
if num_false_positive > (0.0125 * 10_000f64) as usize {
num_mediocre_filters += 1;
} else {
num_good_filters += 1;
}
len = nxt_len(len);
}
break;
}
assert!(
num_mediocre_filters <= num_good_filters / 5,
"{} mediocre filters but only {} good filters",
num_mediocre_filters,
num_good_filters
);
}
#[test]
fn test_hash() {
let mut m = HashMap::new();
m.insert("", 3164544308);
m.insert("r", 1499315714);
m.insert("ru", 2735451247);
m.insert("rus", 1743522570);
m.insert("rust", 3428276233);
m.insert("rusta", 2505947263);
m.insert("rustac", 1658958784);
m.insert("rustace", 1755101662);
m.insert("rustacea", 2293694788);
m.insert("rustacean", 2757417412);
m.insert("I had a dream it would end this way.", 0xe14a9db9);
for (k, v) in m {
assert_eq!(hash(k.as_bytes()), v, "key: {}", k);
}
}
}