keyhog_scanner/probabilistic_gate.rs
1//! Fast probabilistic gating to reject obvious non-secrets before heavy ML scoring.
2//!
3//! Uses character diversity and simple bigram analysis to identify high-entropy noise
4//! like UUIDs, hashes, and base64-encoded binary that doesn't look like a secret.
5
6/// A tiny statistical gate for fast candidate rejection.
7pub struct ProbabilisticGate;
8
9impl ProbabilisticGate {
10 /// Returns true if the candidate string looks like a potential secret.
11 /// Returns false if it's almost certainly noise (UUID, hash, etc).
12 pub fn looks_promising(s: &str) -> bool {
13 if s.len() < 16 {
14 return true; // Too short for reliable gating
15 }
16
17 let mut count = 0;
18 let mut seen = [false; 256];
19 for b in s.bytes() {
20 if !seen[b as usize] {
21 seen[b as usize] = true;
22 count += 1;
23 if count >= 5 {
24 break;
25 }
26 }
27 }
28
29 // UUID detection: exactly 4 dashes in 8-4-4-4-12 hex pattern.
30 // Allocation-free, UTF-8 iteration-free optimized byte scanner.
31 if s.len() >= 32 && s.len() <= 40 {
32 let bytes = s.as_bytes();
33 let dash_count = bytes.iter().filter(|&&b| b == b'-').count();
34 if dash_count == 4 {
35 let mut valid = true;
36 let mut current_len = 0;
37 let mut part_count = 0;
38 for &b in bytes {
39 if b == b'-' {
40 if current_len == 0 {
41 valid = false;
42 break;
43 }
44 current_len = 0;
45 part_count += 1;
46 } else if b.is_ascii_hexdigit() {
47 current_len += 1;
48 } else {
49 valid = false;
50 break;
51 }
52 }
53 if valid && current_len > 0 {
54 part_count += 1;
55 }
56 if valid && part_count == 5 {
57 return false;
58 }
59 }
60 }
61
62 // Extremely low diversity (e.g. "aaaaaaaaaaaaaaaa") is rejected
63 if count < 5 {
64 return false;
65 }
66
67 // Lightweight approximation of a full bigram frequency table.
68 //
69 // Real secrets are alphabet-restricted but bigram-distributed: a
70 // base64 token has roughly uniform bigram frequencies, while a SHA
71 // hex digest has STRONGLY skewed frequencies (only 16 chars × 16
72 // bigrams = 256 possible bigrams, so any 32-char hex string visits
73 // ~31 of those 256). UUIDs without dashes are an extreme case.
74 //
75 // The cheap proxy: count distinct bigrams in s and require a
76 // minimum density. For a length-N candidate with K distinct chars,
77 // a uniform-random base64 string visits ~min(N-1, K^2)
78 // distinct bigrams; a hex string maxes out at 256 regardless of
79 // length. We require distinct_bigrams >= length / 4 (very lax) AND
80 // distinct_bigrams >= 8 (absolute floor for short candidates).
81 // These bounds reject 32-hex SHAs (which have ~28 distinct bigrams
82 // on 32 chars) very rarely - they pass - while killing pure-base64
83 // UUID-without-dashes pads.
84 //
85 // We compute distinct bigrams via a 64-byte (512-bit) bitset over
86 // a 9-bit FNV slot, identical to the bigram_bloom strategy.
87 let bytes = s.as_bytes();
88 if bytes.len() >= 32 {
89 let mut bigram_seen = [0u64; 8]; // 512 bits ≈ 0.6% FP at 28 bigrams
90 for window in bytes.windows(2) {
91 let h = bigram_slot_512(window[0], window[1]);
92 bigram_seen[h >> 6] |= 1u64 << (h & 63);
93 }
94 let distinct: u32 = bigram_seen.iter().map(|w| w.count_ones()).sum();
95 let length_floor = (bytes.len() / 4) as u32;
96 if distinct < 8 || distinct < length_floor {
97 return false;
98 }
99 }
100
101 true
102 }
103}
104
105#[inline]
106fn bigram_slot_512(a: u8, b: u8) -> usize {
107 let mut h: u32 = 0x811c_9dc5;
108 h ^= a as u32;
109 h = h.wrapping_mul(0x0100_0193);
110 h ^= b as u32;
111 h = h.wrapping_mul(0x0100_0193);
112 (h as usize) & 0x01ff
113}