1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
//! Fast probabilistic gating to reject obvious non-secrets before heavy ML scoring.
//!
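//! Uses character diversity, a dashed-hex (UUID-style) shape check, and a cheap
//! distinct-bigram count to weed out noise such as UUIDs and highly repetitive
//! strings. Typical hex digests and base64 blobs are diverse enough to pass this
//! gate and are left to the heavier scoring that follows.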
/// A tiny statistical gate for fast candidate rejection.
///
/// The type carries no state; it only namespaces
/// [`ProbabilisticGate::looks_promising`].
pub struct ProbabilisticGate;

impl ProbabilisticGate {
    /// Candidates shorter than this many bytes are never gated.
    const MIN_GATED_LEN: usize = 16;

    /// Minimum number of distinct byte values a gated candidate must contain.
    const MIN_DISTINCT_BYTES: usize = 5;

    /// The bigram check only runs on candidates at least this many bytes long.
    const MIN_BIGRAM_LEN: usize = 32;

    /// Absolute lower bound on the required number of distinct bigram slots.
    const MIN_BIGRAM_FLOOR: usize = 8;

    /// Upper bound on the required number of distinct bigram slots.
    ///
    /// The bitset used for counting has only 512 slots, so the measured count
    /// saturates at 512. Without a cap, the `len / 4` requirement would exceed
    /// that for every candidate of 2052 bytes or more and reject it no matter
    /// what it contains. Capping at half the bitset keeps the requirement
    /// reachable for arbitrarily long candidates.
    const MAX_BIGRAM_FLOOR: usize = 256;

    /// Returns `true` if the candidate string looks like a potential secret,
    /// and `false` if it is almost certainly noise.
    ///
    /// All lengths are byte lengths. The checks, in order:
    ///
    /// 1. Fewer than 16 bytes: too short for reliable gating, always `true`.
    /// 2. 32 to 40 bytes made of exactly five non-empty groups of ASCII hex
    ///    digits separated by `-` (the shape of a dashed UUID, without
    ///    enforcing the 8-4-4-4-12 group lengths): `false`.
    /// 3. Fewer than five distinct byte values (e.g. `"aaaaaaaaaaaaaaaa"`):
    ///    `false`.
    /// 4. For candidates of 32 bytes or more, the number of distinct bigram
    ///    hash slots must be at least `len / 4`, clamped to the range
    ///    `8..=256`; otherwise `false`. This rejects highly repetitive input.
    ///
    /// Anything that survives all checks yields `true`.
    pub fn looks_promising(s: &str) -> bool {
        let bytes = s.as_bytes();
        if bytes.len() < Self::MIN_GATED_LEN {
            return true;
        }
        if Self::is_dashed_hex_groups(bytes) {
            return false;
        }
        if !Self::has_distinct_bytes(bytes, Self::MIN_DISTINCT_BYTES) {
            return false;
        }
        if bytes.len() < Self::MIN_BIGRAM_LEN {
            return true;
        }

        // The required density grows with length but is clamped on both sides:
        // the lower bound is an absolute floor, the upper bound keeps the
        // requirement below what the 512-slot bitset can actually report.
        let required = (bytes.len() / 4).clamp(Self::MIN_BIGRAM_FLOOR, Self::MAX_BIGRAM_FLOOR);
        Self::distinct_bigram_slots(bytes) >= required
    }

    /// Reports whether `bytes` is 32..=40 bytes long and consists of exactly
    /// five non-empty runs of ASCII hex digits separated by single dashes.
    fn is_dashed_hex_groups(bytes: &[u8]) -> bool {
        if !(32..=40).contains(&bytes.len()) {
            return false;
        }
        let mut groups = 0usize;
        for group in bytes.split(|&b| b == b'-') {
            // An empty group means a leading, trailing, or doubled dash.
            if group.is_empty() || !group.iter().all(u8::is_ascii_hexdigit) {
                return false;
            }
            groups += 1;
        }
        // Five groups is the same as exactly four dashes.
        groups == 5
    }

    /// Reports whether `bytes` contains at least `min` distinct byte values,
    /// stopping the scan as soon as the threshold is reached.
    fn has_distinct_bytes(bytes: &[u8], min: usize) -> bool {
        let mut seen = [false; 256];
        let mut distinct = 0usize;
        for &b in bytes {
            let slot = &mut seen[usize::from(b)];
            if !*slot {
                *slot = true;
                distinct += 1;
                if distinct >= min {
                    return true;
                }
            }
        }
        distinct >= min
    }

    /// Counts how many of the 512 bigram hash slots are hit by the adjacent
    /// byte pairs of `bytes`.
    ///
    /// Different bigrams may share a slot, so the result never exceeds the
    /// true number of distinct bigrams and never exceeds 512.
    fn distinct_bigram_slots(bytes: &[u8]) -> usize {
        // 8 x 64 bits = 512 bits, one per slot returned by `bigram_slot_512`.
        let mut bitset = [0u64; 8];
        for pair in bytes.windows(2) {
            let slot = bigram_slot_512(pair[0], pair[1]);
            bitset[slot >> 6] |= 1u64 << (slot & 63);
        }
        // `count_ones` is at most 64 per word, so widening to usize is lossless.
        bitset.iter().map(|word| word.count_ones() as usize).sum()
    }
}

/// Maps a byte pair to one of 512 slots.
///
/// Runs the 32-bit FNV-1a steps (xor the byte, then multiply by the FNV prime)
/// over `a` then `b`, and keeps the low 9 bits of the result.
#[inline]
fn bigram_slot_512(a: u8, b: u8) -> usize {
    const FNV_OFFSET_BASIS: u32 = 0x811c_9dc5;
    const FNV_PRIME: u32 = 0x0100_0193;

    let hash = [a, b].iter().fold(FNV_OFFSET_BASIS, |acc, &byte| {
        (acc ^ u32::from(byte)).wrapping_mul(FNV_PRIME)
    });
    (hash & 0x01ff) as usize
}