Skip to main content

keyhog_scanner/
probabilistic_gate.rs

1//! Fast probabilistic gating to reject obvious non-secrets before heavy ML scoring.
2//!
3//! Uses character diversity and simple bigram analysis to identify high-entropy noise
4//! like UUIDs, hashes, and base64-encoded binary that doesn't look like a secret.
5
6/// A tiny statistical gate for fast candidate rejection.
7pub struct ProbabilisticGate;
8
9impl ProbabilisticGate {
10    /// Returns true if the candidate string looks like a potential secret.
11    /// Returns false if it's almost certainly noise (UUID, hash, etc).
12    pub fn looks_promising(s: &str) -> bool {
13        if s.len() < 16 {
14            return true; // Too short for reliable gating
15        }
16
17        let mut count = 0;
18        let mut seen = [false; 256];
19        for b in s.bytes() {
20            if !seen[b as usize] {
21                seen[b as usize] = true;
22                count += 1;
23                if count >= 5 {
24                    break;
25                }
26            }
27        }
28
29        // UUID detection: exactly 4 dashes in 8-4-4-4-12 hex pattern.
30        // Allocation-free, UTF-8 iteration-free optimized byte scanner.
31        if s.len() >= 32 && s.len() <= 40 {
32            let bytes = s.as_bytes();
33            let dash_count = bytes.iter().filter(|&&b| b == b'-').count();
34            if dash_count == 4 {
35                let mut valid = true;
36                let mut current_len = 0;
37                let mut part_count = 0;
38                for &b in bytes {
39                    if b == b'-' {
40                        if current_len == 0 {
41                            valid = false;
42                            break;
43                        }
44                        current_len = 0;
45                        part_count += 1;
46                    } else if b.is_ascii_hexdigit() {
47                        current_len += 1;
48                    } else {
49                        valid = false;
50                        break;
51                    }
52                }
53                if valid && current_len > 0 {
54                    part_count += 1;
55                }
56                if valid && part_count == 5 {
57                    return false;
58                }
59            }
60        }
61
62        // Extremely low diversity (e.g. "aaaaaaaaaaaaaaaa") is rejected
63        if count < 5 {
64            return false;
65        }
66
67        // Lightweight approximation of a full bigram frequency table.
68        //
69        // Real secrets are alphabet-restricted but bigram-distributed: a
70        // base64 token has roughly uniform bigram frequencies, while a SHA
71        // hex digest has STRONGLY skewed frequencies (only 16 chars × 16
72        // bigrams = 256 possible bigrams, so any 32-char hex string visits
73        // ~31 of those 256). UUIDs without dashes are an extreme case.
74        //
75        // The cheap proxy: count distinct bigrams in s and require a
76        // minimum density. For a length-N candidate with K distinct chars,
77        // a uniform-random base64 string visits ~min(N-1, K^2)
78        // distinct bigrams; a hex string maxes out at 256 regardless of
79        // length. We require distinct_bigrams >= length / 4 (very lax) AND
80        // distinct_bigrams >= 8 (absolute floor for short candidates).
81        // These bounds reject 32-hex SHAs (which have ~28 distinct bigrams
82        // on 32 chars) very rarely - they pass - while killing pure-base64
83        // UUID-without-dashes pads.
84        //
85        // We compute distinct bigrams via a 64-byte (512-bit) bitset over
86        // a 9-bit FNV slot, identical to the bigram_bloom strategy.
87        let bytes = s.as_bytes();
88        if bytes.len() >= 32 {
89            let mut bigram_seen = [0u64; 8]; // 512 bits ≈ 0.6% FP at 28 bigrams
90            for window in bytes.windows(2) {
91                let h = bigram_slot_512(window[0], window[1]);
92                bigram_seen[h >> 6] |= 1u64 << (h & 63);
93            }
94            let distinct: u32 = bigram_seen.iter().map(|w| w.count_ones()).sum();
95            let length_floor = (bytes.len() / 4) as u32;
96            if distinct < 8 || distinct < length_floor {
97                return false;
98            }
99        }
100
101        true
102    }
103}
104
105#[inline]
106fn bigram_slot_512(a: u8, b: u8) -> usize {
107    let mut h: u32 = 0x811c_9dc5;
108    h ^= a as u32;
109    h = h.wrapping_mul(0x0100_0193);
110    h ^= b as u32;
111    h = h.wrapping_mul(0x0100_0193);
112    (h as usize) & 0x01ff
113}