use fnv::FnvHasher;
use murmur3::murmur3_32;
use std::hash::Hasher;
use std::io::Cursor;
/// Signature for a pluggable hash function: maps (item bytes, number of
/// hashes, table capacity) to one index per hash. For the default
/// implementation below each index is reduced `% capacity`, so values lie
/// in `0..capacity`; custom implementations should uphold the same range.
pub type HashFunction = fn(&[u8], usize, usize) -> Vec<u32>;
/// 32-bit Murmur3 hash of `key` with a fixed zero seed, so results are
/// stable across runs and processes.
///
/// # Panics
/// Panics if the underlying reader fails, which cannot happen for an
/// in-memory `Cursor` — the `expect` documents that invariant.
pub(crate) fn hash_murmur32(key: &[u8]) -> u32 {
    murmur3_32(&mut Cursor::new(key), 0).expect("Failed to compute Murmur3 hash")
}
/// 32-bit hash of `key`: the 64-bit FNV-1a hash truncated to its low 32 bits.
///
/// This is the exact algorithm the `fnv` crate's `FnvHasher` implements
/// (offset basis 0xcbf29ce484222325, prime 0x100000001b3), written out
/// inline: xor each byte into the state, then multiply by the FNV prime.
pub(crate) fn hash_fnv32(key: &[u8]) -> u32 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    let mut state = FNV_OFFSET_BASIS;
    for &byte in key {
        state ^= u64::from(byte);
        state = state.wrapping_mul(FNV_PRIME);
    }
    state as u32
}
/// Default hash family: Kirsch–Mitzenmacher double hashing. Derives
/// `num_hashes` indices from two base hashes as `h1 + i * h2 (mod capacity)`,
/// avoiding `num_hashes` independent hash computations.
///
/// # Panics
/// Panics if `capacity` is 0 (modulo by zero). `capacity` is also truncated
/// to `u32` before the reduction, so capacities above `u32::MAX` misbehave.
///
/// NOTE(review): if `hash_fnv32` returns 0 for an item, every probe collapses
/// to the same index — confirm callers tolerate fewer effective hashes.
pub fn default_hash_function(
    item: &[u8],
    num_hashes: usize,
    capacity: usize,
) -> Vec<u32> {
    let base = hash_murmur32(item);
    let step = hash_fnv32(item);
    let modulus = capacity as u32;
    let mut indices = Vec::with_capacity(num_hashes);
    for i in 0..num_hashes {
        let probe = base.wrapping_add((i as u32).wrapping_mul(step));
        indices.push(probe % modulus);
    }
    indices
}
/// Optimal Bloom-filter bit-vector size for `n` items at false-positive
/// rate `fpr`, from the standard formula m = -n·ln(p) / (ln 2)², rounded up.
pub fn optimal_bit_vector_size(n: usize, fpr: f64) -> usize {
    let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
    (-(n as f64) * fpr.ln() / ln2_squared).ceil() as usize
}
/// Optimal number of hash functions for `n` items in an `m`-bit vector,
/// from the standard formula k = (m/n)·ln 2, rounded to nearest.
pub fn optimal_num_hashes(n: usize, m: usize) -> usize {
    let bits_per_item = m as f64 / n as f64;
    (bits_per_item * std::f64::consts::LN_2).round() as usize
}
/// Splits a filter-wide false-positive target across multiple levels.
///
/// With `effective_levels = 1 + (num_levels - 1) * active_ratio` queried
/// levels on average, each level's FPR must satisfy
/// `(1 - level_fpr)^effective_levels = 1 - target_fpr`, which this solves
/// for `level_fpr`.
///
/// `num_levels == 0` is treated like a single level (returns `target_fpr`)
/// instead of underflowing.
#[allow(dead_code)]
pub fn calculate_level_fpr(
    target_fpr: f64,
    num_levels: usize,
    active_ratio: f64,
) -> f64 {
    // saturating_sub avoids usize underflow (panic in debug, wrap to a huge
    // value in release) when num_levels == 0.
    let effective_levels = 1.0 + num_levels.saturating_sub(1) as f64 * active_ratio;
    1.0 - (1.0 - target_fpr).powf(1.0 / effective_levels)
}
/// Derives per-level Bloom parameters for a multi-level filter: the
/// per-level FPR, the bit-vector size, and the hash count, in that order.
#[allow(dead_code)]
pub fn calculate_optimal_params(
    capacity: usize,
    target_fpr: f64,
    num_levels: usize,
    active_ratio: f64,
) -> (f64, usize, usize) {
    // Tighten the per-level FPR first, then size the vector and hash count
    // from it using the standard single-filter formulas.
    let fpr = calculate_level_fpr(target_fpr, num_levels, active_ratio);
    let bits = optimal_bit_vector_size(capacity, fpr);
    let hashes = optimal_num_hashes(capacity, bits);
    (fpr, bits, hashes)
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_optimal_bit_vector_size() {
        // Theoretical optimum for 10k items at 1% FPR is roughly 95.9k bits.
        let m = optimal_bit_vector_size(10_000, 0.01);
        assert!(
            m > 90_000 && m < 100_000,
            "Optimal size outside expected range: {m}"
        );

        let m = optimal_bit_vector_size(1_000, 0.001);
        assert!(
            m > 13_000 && m < 16_000,
            "Optimal size outside expected range: {m}"
        );

        assert!(
            optimal_bit_vector_size(1, 0.5) > 0,
            "Even small values should produce positive bit size"
        );

        // m grows linearly in n at fixed FPR: 10x the items ≈ 10x the bits.
        let small = optimal_bit_vector_size(1_000, 0.01);
        let large = optimal_bit_vector_size(10_000, 0.01);
        let ratio = large as f64 / small as f64;
        assert!(
            ratio > 9.0 && ratio < 11.0,
            "Bit vector size should scale linearly with item count"
        );
    }

    #[test]
    fn test_optimal_num_hashes() {
        // k = (m/n)·ln2 ≈ 6.93 for m/n = 10, so expect 7 after rounding.
        let k = optimal_num_hashes(1_000, 10_000);
        assert!(
            (6..=8).contains(&k),
            "Optimal hash count outside expected range: {k}"
        );

        // Doubling bits-per-item should roughly double the hash count.
        let base = optimal_num_hashes(1_000, 10_000);
        let doubled = optimal_num_hashes(1_000, 20_000);
        let ratio = doubled as f64 / base as f64;
        assert!(
            ratio > 1.8 && ratio < 2.2,
            "Hash count should scale with m/n ratio"
        );
    }

    #[test]
    fn test_hash_functions_distribution() {
        let capacity = 10000;
        let num_samples = 1000;
        let mut distribution = vec![0; capacity];

        // Hash distinct keys once each and tally which buckets get hit.
        for i in 0..num_samples {
            let data = format!("test_data_{i}").into_bytes();
            for hash in default_hash_function(&data, 1, capacity) {
                distribution[hash as usize] += 1;
            }
        }

        // A reasonable hash should touch well over 5% of the buckets.
        let non_zero = distribution.iter().filter(|&&x| x > 0).count();
        let coverage = non_zero as f64 / capacity as f64;
        assert!(
            coverage > 0.05,
            "Hash distribution coverage too low: {coverage}"
        );

        let mean = distribution.iter().sum::<i32>() as f64 / capacity as f64;
        let expected_mean = num_samples as f64 / capacity as f64;
        let mean_ratio = mean / expected_mean;
        assert!(
            mean_ratio > 0.8 && mean_ratio < 1.2,
            "Mean distribution ratio outside expected range: {mean_ratio}"
        );
    }
}