use std::sync::Mutex;
use std::{io::Cursor, ops};
use byteorder::ReadBytesExt;
#[cfg(feature = "md4_hash")]
use md4::Digest;
#[cfg(feature = "md5_hash")]
use md5::compute as md5_compute;
use num::{cast::AsPrimitive, Bounded, FromPrimitive, ToPrimitive, Unsigned, Zero};
use rand::{rngs::OsRng, Rng};
use rayon::{iter::repeatn, prelude::*};
#[cfg(feature = "serde_support")]
use serde::{Deserialize, Serialize};
#[cfg(feature = "metrics")]
use crate::metrics::Metrics;
use crate::{
error::Error,
types::{HashFunction, Salt},
};
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))]
pub struct SBF<U>
where
U: Unsigned + Bounded + Clone + Copy + PartialOrd + Eq,
{
salts: Vec<Salt>,
pub(crate) filter: Vec<U>,
hash_function: HashFunction,
#[cfg(feature = "metrics")]
pub metrics: Metrics,
}
impl<U> SBF<U>
where
U: 'static
+ Send
+ Sync
+ Clone
+ Copy
+ Ord
+ PartialOrd
+ Eq
+ Unsigned
+ Bounded
+ Zero
+ FromPrimitive
+ ToPrimitive
+ ops::AddAssign
+ ops::SubAssign,
usize: num::cast::AsPrimitive<U>,
{
fn hash(&self, buff: &[u8]) -> Vec<u8> {
match &self.hash_function {
#[cfg(feature = "md5_hash")]
HashFunction::MD5 => md5_compute(buff).to_vec(),
#[cfg(feature = "md4_hash")]
HashFunction::MD4 => md4::Md4::digest(buff).to_vec(),
}
}
fn calc_indexes(&self, content: Vec<u8>) -> Vec<U> {
self.salts
.par_iter()
.map(|salt: &Salt| {
let salt_iterator = salt.par_iter();
let zeros = repeatn(&(0_u8), salt.len());
let content = content.par_iter().chain(zeros);
let xor_content: Vec<u8> = content.zip(salt_iterator).map(|(h, v)| h ^ v).collect();
let digest = self.hash(&xor_content).drain(0..8).collect::<Vec<u8>>();
let digest_value = Cursor::new(digest)
.read_u64::<byteorder::NativeEndian>()
.unwrap();
#[allow(clippy::integer_arithmetic)]
(digest_value as usize % self.filter.len()).as_()
})
.collect::<Vec<U>>()
}
fn get_cell(&self, index: U) -> Result<&U, Error> {
self.filter
.get(index.to_usize().unwrap())
.ok_or(Error::IndexOutOfBounds)
}
fn set_cell(&mut self, index: U, area: U) -> Result<&U, Error> {
if let Some(v) = self.filter.get_mut(index.to_usize().unwrap()) {
if *v == U::zero() || *v < area {
*v = area;
} else if *v >= area {
}
#[cfg(feature = "metrics")]
#[allow(clippy::integer_arithmetic)]
{
if *v == U::zero() {
self.metrics.area_cells[area.to_usize().unwrap()] += 1;
} else if *v < area {
if area > U::zero() {
self.metrics.area_cells[v.to_usize().unwrap()] -= 1;
}
self.metrics.area_cells[area.to_usize().unwrap()] += 1;
self.metrics.collisions += 1;
} else if *v == area {
self.metrics.collisions += 1;
self.metrics.area_self_collisions[v.to_usize().unwrap()] += 1;
} else if *v > area {
self.metrics.collisions += 1;
}
}
Ok(v)
} else {
Err(Error::IndexOutOfBounds)
}
}
#[allow(unused_variables)]
pub fn new(
cells: U,
hash_number: usize,
max_input_size: usize,
hash_function: HashFunction,
area_number: U,
) -> Result<Self, Error> {
assert!(cells > U::zero());
let rng = Mutex::new(OsRng);
let salts = (0..hash_number)
.into_par_iter()
.map(|_| {
(0..max_input_size)
.into_par_iter()
.map(|_| rng.lock().unwrap().gen())
.collect::<Salt>()
})
.collect::<Vec<Salt>>();
Ok(SBF {
filter: vec![U::zero(); cells.to_usize().ok_or(Error::IndexOutOfBounds)?],
hash_function,
salts,
#[cfg(feature = "metrics")]
metrics: Metrics {
cells: cells.to_usize().unwrap(),
hash_number,
members: 0,
collisions: 0,
safeness: 0.0,
area_number: area_number.to_usize().unwrap(),
area_members: vec![0; area_number.to_usize().ok_or(Error::IndexOutOfBounds)?],
area_cells: vec![0; area_number.to_usize().ok_or(Error::IndexOutOfBounds)?],
area_expected_cells: vec![
-1;
area_number.to_usize().ok_or(Error::IndexOutOfBounds)?
],
area_self_collisions: vec![
0;
area_number.to_usize().ok_or(Error::IndexOutOfBounds)?
],
area_fpp: vec![-1.0; area_number.to_usize().ok_or(Error::IndexOutOfBounds)?],
area_isep: vec![-1.0; area_number.to_usize().ok_or(Error::IndexOutOfBounds)?],
area_prior_fpp: vec![-1.0; area_number.to_usize().ok_or(Error::IndexOutOfBounds)?],
area_prior_isep: vec![-1.0; area_number.to_usize().ok_or(Error::IndexOutOfBounds)?],
area_prior_safep: vec![
-1.0;
area_number.to_usize().ok_or(Error::IndexOutOfBounds)?
],
},
})
}
pub fn new_optimal(
expected_inserts: usize,
area_number: U,
max_fpp: f64,
max_input_size: usize,
hash_function: HashFunction,
) -> Result<Self, Error> {
let optimal_cells =
(-(expected_inserts as f64) * max_fpp.ln() / (2.0f64.ln().powi(2))) as u64;
let hash_number =
(optimal_cells as f64 / (expected_inserts as f64) * 2.0f64.ln()).ceil() as usize;
Self::new(
U::from_u64(optimal_cells).ok_or(Error::IndexOutOfBounds)?,
hash_number,
max_input_size,
hash_function,
area_number,
)
}
pub fn check(&self, content: Vec<u8>) -> Result<&U, Error> {
self.calc_indexes(content)
.par_iter()
.map(|i| self.get_cell(*i))
.try_reduce_with(|a, b| Ok(a.min(b)))
.expect("Some value, since the iterator is not empty")
}
pub fn insert(&mut self, content: Vec<u8>, area: U) -> Result<(), Error> {
self.calc_indexes(content)
.iter()
.try_for_each(|i| self.set_cell(*i, area).map(|_| ()))
.map(|_| {
#[cfg(feature = "metrics")]
#[allow(clippy::integer_arithmetic)]
{
self.metrics.members += 1;
self.metrics.area_members[area.to_usize().unwrap()] += 1;
};
})
}
}