1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
pub const NUM_HASHES: u32 = 3;
/// Threshold for allocating the exact-pairs table (64KB).
///
/// The exact-pairs table provides O(1) zero-FPR lookups for all 65,536 possible
/// 2-byte n-grams. It's allocated when the bloom filter has ≥4096 bits because:
///
/// 1. At 4096 bits, the bloom uses 512 bytes; adding 64KB is a 128x increase
/// 2. The table eliminates ALL false positives for 2-byte queries
/// 3. Lookup is a single array index vs 3 hash probes
///
/// For internet-scale workloads (10K+ patterns × 1M+ files), this 64KB
/// investment pays off in reduced FPR and faster rejection of non-matching files.
///
/// NOTE: When using `NgramBloom::from_block` with exactly 4096 bits, the exact
/// pairs table IS allocated and used. However, after serialization via `raw_parts`
/// and deserialization via `from_raw_parts`, the exact-pairs table is NOT
/// reconstructed (it falls back to hash-based lookups). For full performance
/// with exact pairs, rebuild the bloom from raw data instead of deserializing.
pub const EXACT_PAIR_THRESHOLD_BITS: usize = 4096;
/// Number of u64 words in the exact-pairs table (65,536 bits = 8KB).
pub const EXACT_PAIR_WORDS: usize = 65_536 / 64;
/// A bloom filter tracking which n-grams (2-byte sequences) appear in a block.
///
/// Uses double hashing with k=3 for efficient membership testing.
/// For large filters (≥4096 bits), an exact 16-bit pair table provides
/// zero false positives for the 65,536 possible 2-byte sequences.
/// Cache-line-local blocked bloom filter for 2-byte n-grams.
///
/// Each n-gram maps to a single 512-bit block so all probes stay inside one
/// 64-byte cache line.