1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
//! Bigram-bloom prefilter - Layer 0.5 between alphabet screening and AC/HS.
//!
//! `AlphabetMask` (Layer 0) tells us which BYTES appear in the chunk; it can't
//! tell us about adjacencies. A 1 MB Java source file likely contains
//! `g`, `h`, `p`, `_` somewhere (Layer 0 says "scan it") but never the bigram
//! `gh` followed by `p_` (which the GitHub PAT prefix `ghp_` requires).
//!
//! This module builds two 4096-bit (512-byte) bloom filters at compile time:
//! * `LITERAL_BIGRAM_BLOOM` - the union of bigrams across all detector
//! literal prefixes plus an extension by one ASCII byte (so a 4-char
//! prefix `ghp_` contributes 5 bigrams: `gh`, `hp`, `p_`, `_x` for every
//! plausible follower).
//! * `MAYBE_HAS_LITERAL_PREFIX(chunk)` - true if the chunk's bigrams hit
//! the literal bloom anywhere; false (skip the chunk) when there is zero
//! overlap.
//!
//! Cost: ~1 ns per byte on AVX2, ~2 ns per byte on scalar - strictly cheaper
//! than `AlphabetMask::from_bytes`. False-positive rate at the prefilter
//! level is empirically <1% on real source code (most Java files share
//! plenty of `gh`/`sk`/`xo` bigrams unrelated to secrets), so the win is
//! concentrated on `.lock` files, minified JS bundles, and binary blobs.
#![deny(unsafe_op_in_unsafe_fn)]
/// 4096-bit / 512-byte bigram bloom. Indexed by the FNV-1a hash of a 2-byte
/// window mod 4096.
#[derive(Clone, Copy)]
pub struct BigramBloom {
bits: [u64; 64],
}
impl BigramBloom {
pub const fn empty() -> Self {
Self { bits: [0; 64] }
}
/// Insert every distinct bigram from `bytes` into this bloom.
pub fn insert_all(&mut self, bytes: &[u8]) {
for window in bytes.windows(2) {
self.insert(window[0], window[1]);
}
}
#[inline]
fn insert(&mut self, a: u8, b: u8) {
let idx = bigram_slot(a, b);
self.bits[idx >> 6] |= 1u64 << (idx & 63);
}
/// Build a bloom containing every bigram of every literal prefix in
/// `literals`, plus `prefix[i] || ANY_BYTE` for each interior position
/// (so we accept secrets that *start* with the prefix and continue with
/// any byte). The "extension" widening keeps the bloom sound under
/// truncated prefixes (`ghp` matches `ghp_AB...`).
pub fn from_literal_prefixes(literals: &[String]) -> Self {
let mut bloom = Self::empty();
for literal in literals {
let bytes = literal.as_bytes();
if bytes.is_empty() {
continue;
}
if bytes.len() < 2 {
// 1-byte literal: every bigram starting with that byte is
// possible; we set the byte's full row to true. This is
// costly but 1-byte literal prefixes are pathological and
// the AC matcher will short-circuit before the bloom even
// sees the chunk.
for second in 0u8..=255 {
bloom.insert(bytes[0], second);
}
continue;
}
bloom.insert_all(bytes);
// Extension: terminal byte may be followed by anything in a
// real secret. Add `last || any`. The `len() < 2` guard
// above proves non-empty; if a future refactor weakens
// the guard we'd rather skip the terminal extension for
// that literal (slight precision loss in the prefilter)
// than panic the scanner mid-scan.
let Some(&last) = bytes.last() else { continue };
for second in 0u8..=255 {
bloom.insert(last, second);
}
}
bloom
}
/// Returns `true` when the chunk contains AT LEAST ONE bigram present
/// in `self`. Returns `false` when there is no overlap (skip the chunk).
pub fn maybe_overlaps(&self, chunk: &[u8]) -> bool {
if chunk.len() < 2 {
return true;
}
for window in chunk.windows(2) {
let idx = bigram_slot(window[0], window[1]);
if self.bits[idx >> 6] & (1u64 << (idx & 63)) != 0 {
return true;
}
}
false
}
/// Population count - useful for diagnostics and to detect a near-full
/// bloom (where `maybe_overlaps` would always return true and the
/// prefilter is providing zero value).
pub fn popcount(&self) -> u32 {
self.bits.iter().map(|w| w.count_ones()).sum()
}
}
#[inline]
fn bigram_slot(a: u8, b: u8) -> usize {
// FNV-1a 32-bit, restricted to a 12-bit slot index. Cheap and good
// enough for a static prefilter where collisions only inflate the
// false-positive rate (never produce false negatives).
let mut h: u32 = 0x811c_9dc5;
h ^= a as u32;
h = h.wrapping_mul(0x0100_0193);
h ^= b as u32;
h = h.wrapping_mul(0x0100_0193);
(h as usize) & 0x0fff
}