//! Bloom filter hashing utilities and parameter-sizing math (probabilistic_rs/hash.rs).

1use fnv::FnvHasher;
2use murmur3::murmur3_32;
3use std::hash::Hasher;
4use std::io::Cursor;
5
/// A type alias for the hash function used in the Bloom filter.
///
/// This function takes an input item and computes multiple hash indices
/// for the Bloom filter's bit vector.
///
/// **Parameters:**
///
/// - `item: &[u8]`
///   - A byte slice representing the item to be hashed.
/// - `num_hashes: usize`
///   - The number of hash values to compute for the item.
/// - `capacity: usize`
///   - The size of the Bloom filter's bit vector. Implementations must
///     ensure the generated hash indices stay within valid bounds.
///
/// **Returns:**
///
/// - `Vec<u32>`
///   - A vector of `num_hashes` hash indices, each a position in the bit vector.
///
/// **Usage:**
///
/// The hash function computes `num_hashes` hash indices for the given `item`,
/// ensuring each index is within the range `[0, capacity)`. These indices are
/// used to set or check bits in the Bloom filter's bit vector.
pub type HashFunction = fn(&[u8], usize, usize) -> Vec<u32>;
32
33pub(crate) fn hash_murmur32(key: &[u8]) -> u32 {
34    let mut cursor = Cursor::new(key);
35    murmur3_32(&mut cursor, 0).expect("Failed to compute Murmur3 hash")
36}
37
38pub(crate) fn hash_fnv32(key: &[u8]) -> u32 {
39    let mut hasher = FnvHasher::default();
40    hasher.write(key);
41    hasher.finish() as u32
42}
43
44/// Implements the default double-hashing scheme for Bloom filters.
45///
46/// This function uses a technique called "double hashing" to generate multiple hash values
47/// from just two independent hash functions. It's more efficient than computing
48/// completely separate hash functions for each index.
49///
50/// The formula used is: h(i) = (h1 + i * h2) mod capacity
51/// Where:
52/// - h1 is the first hash value (Murmur3)
53/// - h2 is the second hash value (FNV)
54/// - i ranges from 0 to num_hashes-1
55/// - capacity is the size of the bit vector
56///
57/// This approach provides good distribution while being computationally efficient.
58/// The wrapping operations prevent integer overflow on large values.
59///
60/// Parameters:
61/// - `item`: The byte slice to hash
62/// - `num_hashes`: The number of hash values to generate
63/// - `capacity`: The size of the bit vector (used for modulo)
64///
65/// Returns:
66/// A vector of `num_hashes` hash values, each in the range [0, capacity-1]
67pub fn default_hash_function(
68    item: &[u8],
69    num_hashes: usize,
70    capacity: usize,
71) -> Vec<u32> {
72    let h1 = hash_murmur32(item);
73    let h2 = hash_fnv32(item);
74    (0..num_hashes)
75        .map(|i| h1.wrapping_add((i as u32).wrapping_mul(h2)) % capacity as u32)
76        .collect()
77}
78
/// Calculates the optimal bit vector size for a Bloom filter.
///
/// Implements the standard sizing formula
///
///   m = -n * ln(fpr) / ln(2)^2
///
/// which achieves the target false positive rate `fpr` for `n` expected
/// elements, assuming the optimal hash count k = (m/n) * ln(2) is used.
///
/// Derivation sketch:
/// 1. With k hash functions, the false positive rate is (1 - e^(-kn/m))^k.
/// 2. This is minimized at k = (m/n) * ln(2), where it is ~(0.6185)^(m/n).
/// 3. Solving that expression for m yields the formula above.
///
/// Parameters:
/// - `n`: The expected number of elements to be inserted
/// - `fpr`: The target false positive rate (e.g., 0.01 for 1%)
///
/// Returns:
/// The optimal size of the bit vector in bits, rounded up
pub fn optimal_bit_vector_size(n: usize, fpr: f64) -> usize {
    // ln(2)^2 ≈ 0.4805; named for readability.
    let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
    let bits = -(n as f64) * fpr.ln() / ln2_squared;
    bits.ceil() as usize
}
107
/// Calculates the optimal number of hash functions for a Bloom filter.
///
/// The formula used is: k = (m/n) * ln(2), rounded to the nearest integer
/// and clamped to at least 1.
///
/// Where:
/// - k is the optimal number of hash functions
/// - m is the bit vector size
/// - n is the expected number of elements
/// - ln(2) is the natural logarithm of 2
///
/// Mathematical basis:
/// 1. Each hash function sets one bit in the vector
/// 2. The probability of a bit still being 0 after all n elements are inserted is (1 - 1/m)^(k*n)
/// 3. This approximates to e^(-kn/m) for large m
/// 4. The false positive probability is minimized when k = (m/n)*ln(2)
///
/// Parameters:
/// - `n`: The expected number of elements to be inserted (assumed > 0;
///   n == 0 divides by zero in float math and saturates the cast)
/// - `m`: The size of the bit vector
///
/// Returns:
/// The optimal number of hash functions to use, always at least 1
pub fn optimal_num_hashes(n: usize, m: usize) -> usize {
    // Clamp to >= 1: rounding yields 0 whenever m/n < ln(2)/2 ≈ 0.72, and a
    // filter configured with zero hash functions would never set or probe any
    // bit, making it useless.
    (((m as f64 / n as f64) * std::f64::consts::LN_2).round() as usize).max(1)
}
135
/// Calculates the per-level false positive rate needed to achieve the target
/// overall false positive rate in a multi-level Bloom filter.
///
/// In a multi-level filter where a query returns true if any level reports
/// true, false positives compound: overall_fpr = 1 - (1 - level_fpr)^L for L
/// independent active levels. Solving for the per-level rate over the
/// estimated number of active levels gives
///
///   level_fpr = 1 - (1 - target_fpr)^(1 / effective_levels)
///
/// Parameters:
/// - `target_fpr`: The desired overall false positive rate
/// - `num_levels`: The maximum number of levels in the filter
/// - `active_ratio`: Estimated fraction of levels that are typically active (0.0-1.0)
///
/// Returns:
/// The adjusted per-level false positive rate
#[allow(dead_code)]
pub fn calculate_level_fpr(
    target_fpr: f64,
    num_levels: usize,
    active_ratio: f64,
) -> f64 {
    // Effective levels = 1 + (num_levels - 1) * active_ratio.
    // saturating_sub fixes the usize underflow (debug panic / release wrap)
    // that plain `num_levels - 1` had for num_levels == 0; zero levels is now
    // treated the same as one.
    let effective_levels = 1.0 + num_levels.saturating_sub(1) as f64 * active_ratio;
    1.0 - (1.0 - target_fpr).powf(1.0 / effective_levels)
}
160
161/// Calculates optimal Bloom filter parameters based on capacity and target FPR.
162///
163/// This function handles the common initialization logic for Bloom filters,
164/// including FPR adjustment for multi-level filters.
165///
166/// Parameters:
167/// - `capacity`: The maximum number of elements the filter should hold
168/// - `target_fpr`: The desired overall false positive rate
169/// - `num_levels`: The maximum number of levels in the filter
170/// - `active_ratio`: Estimated fraction of levels that are typically active (0.0-1.0)
171///
172/// Returns:
173/// A tuple containing (adjusted_level_fpr, optimal_bit_vector_size, optimal_num_hashes)
174#[allow(dead_code)]
175pub fn calculate_optimal_params(
176    capacity: usize,
177    target_fpr: f64,
178    num_levels: usize,
179    active_ratio: f64,
180) -> (f64, usize, usize) {
181    // Adjust FPR for multi-level filter
182    let level_fpr = calculate_level_fpr(target_fpr, num_levels, active_ratio);
183
184    // Calculate optimal bit vector size for the adjusted FPR
185    let bit_vector_size = optimal_bit_vector_size(capacity, level_fpr);
186
187    // Calculate optimal number of hash functions
188    let num_hashes = optimal_num_hashes(capacity, bit_vector_size);
189
190    // TODO: probably need to remove level_fpr
191    (level_fpr, bit_vector_size, num_hashes)
192}
193
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_optimal_bit_vector_size() {
        // Known values from the literature:
        // 10,000 items @ 1% FPR -> ~95,850 bits; 1,000 items @ 0.1% FPR -> ~14,400 bits.
        for (n, fpr, lo, hi) in [
            (10_000, 0.01, 90_000, 100_000),
            (1_000, 0.001, 13_000, 16_000),
        ] {
            let m = optimal_bit_vector_size(n, fpr);
            assert!(
                m > lo && m < hi,
                "Optimal size outside expected range: {m}"
            );
        }

        // Boundary: even tiny inputs must yield a positive size.
        assert!(
            optimal_bit_vector_size(1, 0.5) > 0,
            "Even small values should produce positive bit size"
        );

        // Scaling: 10x the items should need ~10x the space at the same FPR.
        let ratio = optimal_bit_vector_size(10_000, 0.01) as f64
            / optimal_bit_vector_size(1_000, 0.01) as f64;
        assert!(
            ratio > 9.0 && ratio < 11.0,
            "Bit vector size should scale linearly with item count"
        );
    }

    #[test]
    fn test_optimal_num_hashes() {
        // Literature value: for m/n = 10, optimal k is about 7.
        let k = optimal_num_hashes(1_000, 10_000);
        assert!(
            (6..=8).contains(&k),
            "Optimal hash count outside expected range: {k}"
        );

        // Doubling m/n should roughly double k.
        let ratio = optimal_num_hashes(1_000, 20_000) as f64
            / optimal_num_hashes(1_000, 10_000) as f64;
        assert!(
            ratio > 1.8 && ratio < 2.2,
            "Hash count should scale with m/n ratio"
        );
    }

    #[test]
    fn test_hash_functions_distribution() {
        let capacity = 10000;
        let num_samples = 1000;
        let mut distribution = vec![0; capacity];

        // Hash synthetic keys and tally which buckets they land in.
        for i in 0..num_samples {
            let data = format!("test_data_{i}").into_bytes();
            for hash in default_hash_function(&data, 1, capacity) {
                distribution[hash as usize] += 1;
            }
        }

        // Mean hits per bucket across the whole table.
        let mean = distribution.iter().sum::<i32>() as f64 / capacity as f64;

        // Fraction of buckets touched at least once; a well-spread hash with
        // 1000 samples over 10000 buckets should cover roughly 10%.
        let non_zero = distribution.iter().filter(|&&x| x > 0).count();
        let coverage = non_zero as f64 / capacity as f64;
        assert!(
            coverage > 0.05,
            "Hash distribution coverage too low: {coverage}"
        );

        // Mean should be close to num_samples / capacity.
        let expected_mean = num_samples as f64 / capacity as f64;
        let mean_ratio = mean / expected_mean;
        assert!(
            mean_ratio > 0.8 && mean_ratio < 1.2,
            "Mean distribution ratio outside expected range: {mean_ratio}"
        );
    }
}