//! Hashing utilities and parameter math for Bloom filters (probabilistic_rs/hash.rs).
1use fnv::FnvHasher;
2use murmur3::murmur3_32;
3use std::hash::Hasher;
4use std::io::Cursor;
5
/// A type alias for the hash function used in the Bloom filter.
///
/// Implementations take an input item and compute multiple hash indices
/// into the Bloom filter's bit vector.
///
/// **Parameters:**
///
/// - `item: &[u8]`
///   - A byte slice representing the item to be hashed.
/// - `num_hashes: usize`
///   - The number of hash values to compute for the item.
/// - `capacity: usize`
///   - The size of the Bloom filter's bit vector. This ensures that
///     the generated hash indices are within valid bounds.
///
/// **Returns:**
///
/// - `Vec<u32>`
///   - A vector of hash indices corresponding to positions in the bit vector.
///
/// **Usage:**
///
/// The hash function computes `num_hashes` hash indices for the given `item`,
/// ensuring each index is within the range `[0, capacity)`. These indices are
/// used to set or check bits in the Bloom filter's bit vector.
pub type HashFunction = fn(&[u8], usize, usize) -> Vec<u32>;
32
/// Computes the 32-bit MurmurHash3 (x86_32 variant, seed 0) of `key`.
///
/// Implements the reference algorithm directly over the byte slice instead of
/// routing the data through a `std::io::Read` adapter; reading from an
/// in-memory cursor cannot fail, so the previous `expect` was a spurious
/// panic path.
pub(crate) fn hash_murmur32(key: &[u8]) -> u32 {
    // Constants from the MurmurHash3 x86_32 reference implementation.
    const C1: u32 = 0xcc9e_2d51;
    const C2: u32 = 0x1b87_3593;
    const SEED: u32 = 0;

    let mut h = SEED;

    // Body: mix each complete little-endian 4-byte block into the state.
    let mut blocks = key.chunks_exact(4);
    for block in &mut blocks {
        let mut k = u32::from_le_bytes([block[0], block[1], block[2], block[3]]);
        k = k.wrapping_mul(C1).rotate_left(15).wrapping_mul(C2);
        h = (h ^ k).rotate_left(13).wrapping_mul(5).wrapping_add(0xe654_6b64);
    }

    // Tail: fold in the remaining 1-3 bytes (little-endian; no rotate of `h`).
    let tail = blocks.remainder();
    if !tail.is_empty() {
        let mut k = 0u32;
        for (i, &byte) in tail.iter().enumerate() {
            k |= u32::from(byte) << (8 * i);
        }
        k = k.wrapping_mul(C1).rotate_left(15).wrapping_mul(C2);
        h ^= k;
    }

    // Finalization: xor in the length, then apply the fmix32 avalanche.
    h ^= key.len() as u32;
    h ^= h >> 16;
    h = h.wrapping_mul(0x85eb_ca6b);
    h ^= h >> 13;
    h = h.wrapping_mul(0xc2b2_ae35);
    h ^= h >> 16;
    h
}
37
/// Computes a 32-bit FNV hash of `key` by truncating the 64-bit FNV-1a hash.
///
/// Implements FNV-1a directly (offset basis `0xcbf29ce484222325`, prime
/// `0x100000001b3` — the same parameters the `fnv` crate's default hasher
/// uses), avoiding the stateful `Hasher` machinery for a simple one-shot
/// hash. The truncation to `u32` is intentional: only the low 32 bits are
/// needed for bit-vector indexing.
pub(crate) fn hash_fnv32(key: &[u8]) -> u32 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    let mut hash = FNV_OFFSET_BASIS;
    for &byte in key {
        // FNV-1a: xor the byte in first, then multiply by the prime.
        hash ^= u64::from(byte);
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    hash as u32
}
43
44/// Implements the default double-hashing scheme for Bloom filters.
45///
46/// This function uses a technique called "double hashing" to generate multiple hash values
47/// from just two independent hash functions. It's more efficient than computing
48/// completely separate hash functions for each index.
49///
50/// The formula used is: h(i) = (h1 + i * h2) mod capacity
51/// Where:
52/// - h1 is the first hash value (Murmur3)
53/// - h2 is the second hash value (FNV)
54/// - i ranges from 0 to num_hashes-1
55/// - capacity is the size of the bit vector
56///
57/// This approach provides good distribution while being computationally efficient.
58/// The wrapping operations prevent integer overflow on large values.
59///
60/// Parameters:
61/// - `item`: The byte slice to hash
62/// - `num_hashes`: The number of hash values to generate
63/// - `capacity`: The size of the bit vector (used for modulo)
64///
65/// Returns:
66/// A vector of `num_hashes` hash values, each in the range [0, capacity-1]
67pub fn default_hash_function(
68 item: &[u8],
69 num_hashes: usize,
70 capacity: usize,
71) -> Vec<u32> {
72 let h1 = hash_murmur32(item);
73 let h2 = hash_fnv32(item);
74 (0..num_hashes)
75 .map(|i| h1.wrapping_add((i as u32).wrapping_mul(h2)) % capacity as u32)
76 .collect()
77}
78
/// Calculates the optimal bit vector size for a Bloom filter.
///
/// Given an expected element count `n` and a target false positive rate,
/// returns the smallest bit-array size `m` achieving that rate:
///
///   m = ceil(-n * ln(fpr) / ln(2)^2)
///
/// Derivation sketch: with the optimal hash count k = (m/n)·ln 2, the false
/// positive rate of a Bloom filter is approximately (1 - e^(-kn/m))^k ≈
/// 0.6185^(m/n); solving for `m` at the target rate yields the formula above.
///
/// Parameters:
/// - `n`: The expected number of elements to be inserted
/// - `fpr`: The target false positive rate (e.g., 0.01 for 1%), in (0, 1)
///
/// Returns:
/// The optimal size of the bit vector, in bits
pub fn optimal_bit_vector_size(n: usize, fpr: f64) -> usize {
    let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
    let bits = -(n as f64) * fpr.ln() / ln2_squared;
    bits.ceil() as usize
}
107
/// Calculates the optimal number of hash functions for a Bloom filter.
///
/// For a bit vector of `m` bits holding `n` expected elements, the false
/// positive rate is minimized at
///
///   k = (m / n) * ln(2)
///
/// rounded to the nearest whole number of hash functions.
///
/// Rationale: after inserting `n` elements with `k` hashes each, the chance a
/// given bit is still 0 is (1 - 1/m)^(k·n) ≈ e^(-kn/m) for large `m`; the
/// resulting false positive probability (1 - e^(-kn/m))^k is minimized at
/// k = (m/n)·ln 2.
///
/// Parameters:
/// - `n`: The expected number of elements to be inserted
/// - `m`: The size of the bit vector, in bits
///
/// Returns:
/// The optimal number of hash functions to use
pub fn optimal_num_hashes(n: usize, m: usize) -> usize {
    let bits_per_element = m as f64 / n as f64;
    (bits_per_element * std::f64::consts::LN_2).round() as usize
}
135
/// Calculates the per-level false positive rate needed to achieve the target
/// overall false positive rate in a multi-level Bloom filter.
///
/// In a multi-level filter a query reports "present" if ANY level reports
/// true, so per-level false positives compound: with `L` effectively active
/// levels and per-level rate `p`, the overall FPR is `1 - (1 - p)^L`.
/// Solving for `p` gives the formula implemented here:
///
///   p = 1 - (1 - target_fpr)^(1 / effective_levels)
///
/// where `effective_levels = 1 + (num_levels - 1) * active_ratio` estimates
/// how many levels are typically populated (at least one level is always
/// assumed active).
///
/// Parameters:
/// - `target_fpr`: The desired overall false positive rate
/// - `num_levels`: The maximum number of levels in the filter
/// - `active_ratio`: Estimated fraction of levels that are typically active (0.0-1.0)
///
/// Returns:
/// The adjusted per-level false positive rate
#[allow(dead_code)] // public helper, not yet wired into every filter variant
pub fn calculate_level_fpr(
    target_fpr: f64,
    num_levels: usize,
    active_ratio: f64,
) -> f64 {
    // `saturating_sub` guards against usize underflow (a debug-build panic,
    // or a huge wrapped value in release) when `num_levels == 0`; that
    // degenerate input is treated the same as a single-level filter.
    let effective_levels = 1.0 + num_levels.saturating_sub(1) as f64 * active_ratio;
    // Invert the compounding: per-level rate that yields the overall target.
    1.0 - (1.0 - target_fpr).powf(1.0 / effective_levels)
}
160
161/// Calculates optimal Bloom filter parameters based on capacity and target FPR.
162///
163/// This function handles the common initialization logic for Bloom filters,
164/// including FPR adjustment for multi-level filters.
165///
166/// Parameters:
167/// - `capacity`: The maximum number of elements the filter should hold
168/// - `target_fpr`: The desired overall false positive rate
169/// - `num_levels`: The maximum number of levels in the filter
170/// - `active_ratio`: Estimated fraction of levels that are typically active (0.0-1.0)
171///
172/// Returns:
173/// A tuple containing (adjusted_level_fpr, optimal_bit_vector_size, optimal_num_hashes)
174#[allow(dead_code)]
175pub fn calculate_optimal_params(
176 capacity: usize,
177 target_fpr: f64,
178 num_levels: usize,
179 active_ratio: f64,
180) -> (f64, usize, usize) {
181 // Adjust FPR for multi-level filter
182 let level_fpr = calculate_level_fpr(target_fpr, num_levels, active_ratio);
183
184 // Calculate optimal bit vector size for the adjusted FPR
185 let bit_vector_size = optimal_bit_vector_size(capacity, level_fpr);
186
187 // Calculate optimal number of hash functions
188 let num_hashes = optimal_num_hashes(capacity, bit_vector_size);
189
190 // TODO: probably need to remove level_fpr
191 (level_fpr, bit_vector_size, num_hashes)
192}
193
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_optimal_bit_vector_size() {
        // Literature check: 10,000 items at 1% FPR needs roughly 95,850 bits.
        let m = optimal_bit_vector_size(10_000, 0.01);
        assert!(
            (90_001..100_000).contains(&m),
            "Optimal size outside expected range: {m}"
        );

        // Literature check: 1,000 items at 0.1% FPR needs roughly 14,400 bits.
        let m = optimal_bit_vector_size(1_000, 0.001);
        assert!(
            (13_001..16_000).contains(&m),
            "Optimal size outside expected range: {m}"
        );

        // Degenerate input still yields a usable (positive) size.
        assert!(
            optimal_bit_vector_size(1, 0.5) > 0,
            "Even small values should produce positive bit size"
        );

        // Linear scaling: 10x the items should need ~10x the bits at equal FPR.
        let small = optimal_bit_vector_size(1_000, 0.01);
        let large = optimal_bit_vector_size(10_000, 0.01);
        let ratio = large as f64 / small as f64;
        assert!(
            ratio > 9.0 && ratio < 11.0,
            "Bit vector size should scale linearly with item count"
        );
    }

    #[test]
    fn test_optimal_num_hashes() {
        // Literature check: m/n = 10 gives k around 7.
        let k = optimal_num_hashes(1_000, 10_000);
        assert!(
            (6..=8).contains(&k),
            "Optimal hash count outside expected range: {k}"
        );

        // Doubling m/n should roughly double k.
        let base = optimal_num_hashes(1_000, 10_000);
        let doubled = optimal_num_hashes(1_000, 20_000);
        let ratio = doubled as f64 / base as f64;
        assert!(
            ratio > 1.8 && ratio < 2.2,
            "Hash count should scale with m/n ratio"
        );
    }

    #[test]
    fn test_hash_functions_distribution() {
        let capacity = 10000;
        let num_samples = 1000;
        let mut distribution = vec![0; capacity];

        // Hash deterministic-but-varied inputs and tally which buckets get hit.
        for i in 0..num_samples {
            let data = format!("test_data_{i}").into_bytes();
            for hash in default_hash_function(&data, 1, capacity) {
                distribution[hash as usize] += 1;
            }
        }

        // A good hash should touch a decent fraction of buckets:
        // ~1000 samples into 10,000 buckets should cover roughly 10%.
        let non_zero = distribution.iter().filter(|&&count| count > 0).count();
        let coverage = non_zero as f64 / capacity as f64;
        assert!(
            coverage > 0.05,
            "Hash distribution coverage too low: {coverage}"
        );

        // The average bucket load should track samples/buckets closely.
        let mean = distribution.iter().sum::<i32>() as f64 / capacity as f64;
        let expected_mean = num_samples as f64 / capacity as f64;
        let mean_ratio = mean / expected_mean;
        assert!(
            mean_ratio > 0.8 && mean_ratio < 1.2,
            "Mean distribution ratio outside expected range: {mean_ratio}"
        );
    }
}
291}