keyhog_scanner/decode_structure.rs
1//! Decode-structure analysis: keyhog's decode-through advantage, fed into
2//! scoring.
3//!
4//! A generic high-entropy candidate (caught by `generic-secret`,
5//! `generic-password`, `entropy-*`) is ambiguous on its surface: a real
6//! base64/hex secret and a base64-wrapped *binary asset* (a PNG, a gzip blob,
7//! a serialized protobuf, an embedded cert) look identical to an
8//! entropy/regex/token-efficiency filter. The distinguishing signal is what
9//! the candidate *decodes to* - and keyhog already decodes. This module turns
10//! the decoded bytes into a verdict the confidence pipeline (and, later, the ML
11//! feature vector) can use.
12//!
13//! The verdict is built only on **definitional** signals, so it never
14//! false-suppresses a real credential:
//! * **Magic bytes.** A blob that decodes to a PNG/JPEG/GIF/gzip/zip/PDF/ELF/
//!   Mach-O/PE/zstd/xz/bzip2/7z/SQLite/Java-class header IS that format. Over
//!   3000 random 24-48 byte secrets, ZERO carry any of these headers at
//!   offset 0 (most are 4-8 specific bytes out of 256^k; the shortest
//!   signatures - gzip, zlib and PE `MZ` - are only 2 bytes, so each of those
//!   matches uniformly random bytes at roughly 1 in 65536).
19//! * **Full protobuf-wire parse.** Bytes that parse end-to-end as a protobuf
20//! wire stream (valid field tags, valid wire types, length-delimited fields
21//! that stay in bounds, whole buffer consumed) with several fields are a
22//! serialized message. Random bytes parse this way <0.5% of the time, and
23//! we additionally require >= 3 fields and >= 8 bytes.
24//!
25//! Printable-ratio is recorded for the future ML feature but is NOT used in the
26//! boolean verdict: random secret bytes and binary blobs both sit around 37-50%
27//! printable, so it is too weak to gate suppression on its own.
28//!
29//! Tests live in `tests/unit/decode_structure*.rs` (Santh no-inline-tests
30//! contract).
31
32use base64::Engine;
33
/// What a candidate string turns into once it has been decoded.
///
/// Produced by [`analyze`]. The whole structure is intended to flow into the
/// ML feature vector after the model is retrained; for now its only consumer
/// is [`is_encoded_binary`], which reduces it to a boolean through
/// [`DecodeStructure::is_binary_payload`].
///
/// `Default` yields the "not decodable" value: every flag `false`, zero
/// length, zero ratio, no magic.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct DecodeStructure {
    /// `true` when the candidate was long enough to be worth decoding and
    /// decoded cleanly as base64 (standard or url-safe) or hex.
    pub decodable: bool,
    /// Size of the decoded payload in bytes; stays `0` when not decodable.
    pub decoded_len: usize,
    /// Share of decoded bytes that are printable ASCII, counting tab, line
    /// feed and carriage return as printable. Recorded as a feature only; it
    /// plays no part in [`DecodeStructure::is_binary_payload`].
    pub printable_ratio: f32,
    /// Name of the container/asset format recognised from the leading magic
    /// bytes of the decoded payload, when one matched.
    pub magic: Option<&'static str>,
    /// `true` when the decoded payload parses from start to end as a
    /// multi-field protobuf wire stream.
    pub protobuf_wire: bool,
}

impl DecodeStructure {
    /// Reports whether the decoded payload is data rather than a credential:
    /// either a recognised binary format, or a protobuf wire stream of at
    /// least 8 decoded bytes.
    #[must_use]
    pub fn is_binary_payload(&self) -> bool {
        // A magic-byte hit is sufficient on its own.
        if self.magic.is_some() {
            return true;
        }
        // Otherwise the protobuf verdict only counts for payloads of 8+ bytes.
        self.protobuf_wire && self.decoded_len >= 8
    }
}
60
/// Minimum candidate length (in bytes of the trimmed string) before we bother
/// decoding. A base64 blob needs >= 8 chars to carry a 4-byte magic header,
/// and short tokens are the job of the named detectors anyway.
///
/// The same floor is applied by every decode-through entry point in this
/// module (`analyze` and the `compute_decoded_*` helpers) and by the hex
/// branch of `decode_candidate`, so raising or lowering it moves all of them
/// together.
const MIN_DECODE_LEN: usize = 16;
65
66/// Conservative verdict for the confidence pipeline: does this generic
67/// candidate decode to identifiable binary / serialized data? Real secrets
68/// return `false`.
69///
70/// Memoized: a single match is scored on this twice (ML feature #41 in
71/// `ml_features` and the generic-detector confidence penalty in
72/// `confidence::penalties`), and a scan re-encounters the same token across
73/// chunks. Without the cache every call re-decodes and re-parses the bytes.
74/// Thread-local + bounded with wholesale eviction, mirroring
75/// `entropy::shannon_entropy`. The verdict is a pure function of `candidate`,
76/// so caching by content hash is always correct.
77#[must_use]
78pub fn is_encoded_binary(candidate: &str) -> bool {
79 use std::cell::RefCell;
80 use std::collections::HashMap;
81
82 const MAX_CACHE_ENTRIES: usize = 4096;
83
84 thread_local! {
85 static CACHE: RefCell<HashMap<u64, bool>> = RefCell::new(HashMap::with_capacity(256));
86 }
87
88 // FNV-1a over the candidate bytes - the same hash the entropy / ML-score
89 // caches key on.
90 let mut hash: u64 = 0xcbf29ce484222325;
91 for &byte in candidate.as_bytes() {
92 hash ^= u64::from(byte);
93 hash = hash.wrapping_mul(0x100000001b3);
94 }
95
96 CACHE.with(|cache| {
97 if let Some(&verdict) = cache.borrow().get(&hash) {
98 return verdict;
99 }
100 let verdict = analyze(candidate).is_binary_payload();
101 let mut cache = cache.borrow_mut();
102 if cache.len() >= MAX_CACHE_ENTRIES {
103 cache.clear();
104 }
105 cache.insert(hash, verdict);
106 verdict
107 })
108}
109
/// Placeholder words that mark a credential as a documentation sample, not a
/// real secret. The single source of truth for the lowercase byte-slice
/// placeholder set: consumed for the SURFACE form by
/// `confidence::penalties::contains_placeholder_word` and for the BASE64 / HEX
/// decoded form by this module's [`decoded_contains_placeholder`] (so a
/// base64-wrapped `AKIAEXAMPLEEXAMPLE12` = `QUtJQUVYQU1QTEVFWEFNUExFMTI=` is
/// still caught).
///
/// Excludes ambiguous tokens by design: `test` (real Stripe `sk_test_` keys),
/// `password` (connection strings `redis://user:password@host`), `admin` /
/// `root` (legitimate credentials), `qwerty` (weak but real password).
///
/// Every entry must be non-empty: the decoded-form check slides a window of
/// `word.len()` bytes over the payload, and `slice::windows` panics on a
/// zero-sized window.
pub const PLACEHOLDER_WORDS: &[&[u8]] = &[
    b"example",
    b"dummy",
    b"fake",
    b"sample",
    b"placeholder",
    b"changeme",
];
129
/// Unified, shape-only gate for the "uniform random base64 blob" class - the
/// one parameterized definition behind every base64-protobuf-decoy gate in
/// the scanner. It replaces two copies that had drifted apart (this module's
/// penalty-path [`looks_like_uniform_base64_blob`] and the entropy path's
/// `engine::fallback_entropy_helpers::entropy_path_looks_like_random_base64_blob`)
/// so the length / diversity bands are tuned in a single place.
///
/// Nothing is decoded here; only the surface of `value` is inspected. The
/// result is `true` when all of the following hold:
/// 1. `value.len()` lies in `min_len..=max_len` (an inverted band admits
///    nothing);
/// 2. the length is a multiple of 4, or `value` ends with `=`;
/// 3. every byte is in the standard base64 alphabet (`A-Za-z0-9`, `+`, `/`,
///    `=`). Any other byte rejects immediately, which clears base64url tokens
///    (GitHub PATs, OAuth bearers), JWTs (`.`) and Slack tokens (`-`);
/// 4. at least one admit clause fires: the value contains `+` or `/`, or it
///    ends with `=`, or its length is a multiple of 4 and it uses at least
///    `min_diversity` distinct alphanumeric characters. The diversity clause
///    catches pure-alphanumeric base64 that random-byte encodings reach but
///    placeholders / English words do not at the band floor.
///
/// Passing `min_diversity == 0` switches the diversity clause off, leaving
/// punctuation and padding as the only admits. That is how a caller wanting
/// "structural punctuation required" (the entropy path's intent) shares this
/// skeleton; the entropy path layers its own "both `+` and `/`" requirement on
/// top in its own wrapper.
#[must_use]
pub fn is_random_base64_blob(
    value: &str,
    min_len: usize,
    max_len: usize,
    min_diversity: u32,
) -> bool {
    let len = value.len();
    if len < min_len || len > max_len {
        return false;
    }

    // A trailing `==` necessarily ends with `=`, so one check covers both.
    let padded = value.ends_with('=');
    let quad_aligned = len % 4 == 0;
    if !(padded || quad_aligned) {
        return false;
    }

    let mut saw_punct = false;
    // One bit per ASCII code point; only alphanumerics (all < 128) are set.
    let mut alnum_seen: u128 = 0;
    for byte in value.bytes() {
        match byte {
            b'+' | b'/' => saw_punct = true,
            // Padding is tolerated anywhere and contributes nothing.
            b'=' => {}
            _ if byte.is_ascii_alphanumeric() => alnum_seen |= 1u128 << byte,
            // `-`, `_`, `.`, whitespace, non-ASCII: not standard base64.
            _ => return false,
        }
    }

    if saw_punct || padded {
        return true;
    }
    // Diversity admit; a zero floor disables it entirely.
    min_diversity != 0 && quad_aligned && alnum_seen.count_ones() >= min_diversity
}
201
202/// Shape-only check: does `value` look like a uniform base64 blob with no
203/// structure markers? Thin wrapper over [`is_random_base64_blob`] with the
204/// penalty-path band (44..=600) and diversity floor (32). Matches the
205/// `random-base64-protobuf` corpus shape (random bytes base64-encoded into a
206/// `password=`/`secret=` slot) without firing on real service-anchored
207/// credentials:
208/// * AWS secret access keys (40 base62 chars, no +/, no padding) - too short
209/// * GitHub PATs (40+ chars but contain `_`) - skipped (alphabet check)
210/// * npm tokens (36 chars base62) - too short, skipped
211/// * Stripe keys (32 chars, `sk_`/`pk_` prefix with `_`) - skipped
212/// * Slack tokens (xox*-prefixed with `-`) - skipped
213/// * JWT tokens (`.` separators) - skipped
214/// * OAuth bearer tokens with `-`/`_` (base64url) - skipped via alphabet
215///
216/// Used by `confidence::penalties::apply_post_ml_penalties` as the generic-
217/// detector branch's "this is a random base64 blob, not a credential" gate.
218/// Mirror v27 had 56 base64-protobuf FPs surviving every other suppression;
219/// this is the dedicated gate for that class. v33 widened the floor from
220/// 60 to 44 and added a high-diversity admit so pure-alphanumeric base64
221/// (lacking +/) is also slammed - 14+ FPs in the corpus relied on the
222/// gap.
223#[must_use]
224pub fn looks_like_uniform_base64_blob(value: &str) -> bool {
225 is_random_base64_blob(value, 44, 600, 32)
226}
227
228/// True when `value` base64-decodes to bytes that are themselves all in
229/// the base64 alphabet (double-encoded base64). k8s `data:` fields wrap
230/// their values in another base64 layer; the inner decoded bytes are the
231/// actual user content, and when those bytes are themselves a printable
232/// base64 blob the outer wrapper is categorically data, not a credential.
233///
234/// Conservative: requires the decoded length to be >= 32 chars AND the
235/// decoded bytes to be all standard-base64 alphabet (A-Za-z0-9+/=).
236/// Random secret bytes would produce non-base64 bytes (non-printable,
237/// 0x00..0x20, 0x80..0xFF) so this is definitional, not heuristic.
238///
239/// Memoized via the same FNV-1a hash + thread-local cache pattern as the
240/// other decode-through helpers.
241#[must_use]
242pub fn decoded_is_base64_blob(candidate: &str) -> bool {
243 use std::cell::RefCell;
244 use std::collections::HashMap;
245
246 const MAX_CACHE_ENTRIES: usize = 4096;
247
248 thread_local! {
249 static CACHE: RefCell<HashMap<u64, bool>> = RefCell::new(HashMap::with_capacity(256));
250 }
251
252 let mut hash: u64 = 0xcbf29ce484222325;
253 for &byte in candidate.as_bytes() {
254 hash ^= u64::from(byte);
255 hash = hash.wrapping_mul(0x100000001b3);
256 }
257
258 CACHE.with(|cache| {
259 if let Some(&verdict) = cache.borrow().get(&hash) {
260 return verdict;
261 }
262 let verdict = compute_decoded_is_base64_blob(candidate);
263 let mut cache = cache.borrow_mut();
264 if cache.len() >= MAX_CACHE_ENTRIES {
265 cache.clear();
266 }
267 cache.insert(hash, verdict);
268 verdict
269 })
270}
271
272fn compute_decoded_is_base64_blob(candidate: &str) -> bool {
273 let trimmed = candidate.trim();
274 if trimmed.len() < MIN_DECODE_LEN {
275 return false;
276 }
277 let Some(bytes) = decode_candidate(trimmed) else {
278 return false;
279 };
280 if bytes.len() < 32 {
281 return false;
282 }
283 bytes
284 .iter()
285 .all(|&b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'='))
286}
287
288/// Decode `candidate` (base64 / url-safe-base64 / hex) and check whether the
289/// decoded bytes contain any placeholder word case-insensitively. Composes
290/// keyhog's decode-through with the placeholder suppression: a docs sample
291/// that arrives base64-wrapped (e.g. AWS docs publishing AKIAEXAMPLEEXAMPLE12
292/// as the base64-encoded body of a yaml secret) is now recognized as a sample
293/// even though the surface form looks like high-entropy random bytes. Mirror
294/// v26: 9 docs-example-marker FPs (all `QUtJQUVYQU1QTEVFWEFNUExFMTI=`, base64
295/// of AKIA...EXAMPLE...12) collapsed by this gate. Memoized to match the
296/// existing `is_encoded_binary` call cadence.
297#[must_use]
298pub fn decoded_contains_placeholder(candidate: &str) -> bool {
299 use std::cell::RefCell;
300 use std::collections::HashMap;
301
302 const MAX_CACHE_ENTRIES: usize = 4096;
303
304 thread_local! {
305 static CACHE: RefCell<HashMap<u64, bool>> = RefCell::new(HashMap::with_capacity(256));
306 }
307
308 // FNV-1a over the candidate bytes - keyed identically to is_encoded_binary
309 // so the two caches cost a single hash per credential.
310 let mut hash: u64 = 0xcbf29ce484222325;
311 for &byte in candidate.as_bytes() {
312 hash ^= u64::from(byte);
313 hash = hash.wrapping_mul(0x100000001b3);
314 }
315
316 CACHE.with(|cache| {
317 if let Some(&verdict) = cache.borrow().get(&hash) {
318 return verdict;
319 }
320 let verdict = compute_decoded_contains_placeholder(candidate);
321 let mut cache = cache.borrow_mut();
322 if cache.len() >= MAX_CACHE_ENTRIES {
323 cache.clear();
324 }
325 cache.insert(hash, verdict);
326 verdict
327 })
328}
329
330fn compute_decoded_contains_placeholder(candidate: &str) -> bool {
331 let trimmed = candidate.trim();
332 if trimmed.len() < MIN_DECODE_LEN {
333 return false;
334 }
335 let Some(bytes) = decode_candidate(trimmed) else {
336 return false;
337 };
338 if bytes.is_empty() {
339 return false;
340 }
341 PLACEHOLDER_WORDS.iter().any(|word| {
342 bytes
343 .windows(word.len())
344 .any(|window| window.eq_ignore_ascii_case(word))
345 })
346}
347
348/// Decode `candidate` (base64 standard, base64 url-safe, or hex) and describe
349/// the resulting bytes. Returns a default (non-decodable) structure when the
350/// candidate is too short or not a clean encoding.
351#[must_use]
352pub fn analyze(candidate: &str) -> DecodeStructure {
353 let trimmed = candidate.trim();
354 if trimmed.len() < MIN_DECODE_LEN {
355 return DecodeStructure::default();
356 }
357 let Some(bytes) = decode_candidate(trimmed) else {
358 return DecodeStructure::default();
359 };
360 if bytes.is_empty() {
361 return DecodeStructure::default();
362 }
363 let printable = bytes
364 .iter()
365 .filter(|&&b| (32..127).contains(&b) || matches!(b, 9 | 10 | 13))
366 .count();
367 DecodeStructure {
368 decodable: true,
369 decoded_len: bytes.len(),
370 printable_ratio: printable as f32 / bytes.len() as f32,
371 magic: magic_format(&bytes),
372 protobuf_wire: parse_protobuf_wire(&bytes),
373 }
374}
375
376/// Decode the candidate as base64 (standard then url-safe, padded or not) or,
377/// failing that, as an even-length all-hex string. Only accepts clean,
378/// whole-string decodes so a stray match does not masquerade as binary.
379fn decode_candidate(s: &str) -> Option<Vec<u8>> {
380 // base64 alphabets are a superset of hex's, so try base64 first and only
381 // fall back to hex for strings that are NOT valid base64.
382 let looks_b64 = s
383 .bytes()
384 .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_' | b'='));
385 if looks_b64 {
386 // Pad to a multiple of 4 so unpadded blobs decode.
387 let mut padded = s.to_string();
388 let rem = padded.len() % 4;
389 if rem != 0 {
390 padded.push_str(&"=".repeat(4 - rem));
391 }
392 if let Ok(b) = base64::engine::general_purpose::STANDARD.decode(padded.as_bytes()) {
393 return Some(b);
394 }
395 if let Ok(b) = base64::engine::general_purpose::URL_SAFE.decode(padded.as_bytes()) {
396 return Some(b);
397 }
398 }
399 if s.len() >= MIN_DECODE_LEN && s.len() % 2 == 0 && s.bytes().all(|b| b.is_ascii_hexdigit()) {
400 let mut out = Vec::with_capacity(s.len() / 2);
401 let raw = s.as_bytes();
402 let mut i = 0;
403 while i + 1 < raw.len() {
404 let hi = (raw[i] as char).to_digit(16)?;
405 let lo = (raw[i + 1] as char).to_digit(16)?;
406 out.push(((hi << 4) | lo) as u8);
407 i += 2;
408 }
409 return Some(out);
410 }
411 None
412}
413
/// Identifies common binary container/asset formats from the leading magic
/// bytes of `b`.
///
/// Returns the format name of the first signature in the table that `b`
/// starts with, or `None` when nothing matches (including when `b` is empty
/// or shorter than every signature).
///
/// These headers are definitional: a stream that begins with one IS that
/// format. Most signatures are 4-8 bytes; a few (gzip, zlib, PE `MZ`) are
/// only 2 bytes and JPEG / bzip2 are 3, so those are the entries most likely
/// to match arbitrary bytes by chance.
fn magic_format(b: &[u8]) -> Option<&'static str> {
    const SIGS: &[(&[u8], &str)] = &[
        (b"\x89PNG\r\n\x1a\n", "png"),
        (b"\xff\xd8\xff", "jpeg"),
        (b"GIF87a", "gif"),
        (b"GIF89a", "gif"),
        (b"\x1f\x8b", "gzip"),
        (b"BZh", "bzip2"),
        (b"\xfd7zXZ\x00", "xz"),
        (b"\x28\xb5\x2f\xfd", "zstd"),
        (b"PK\x03\x04", "zip"),
        (b"PK\x05\x06", "zip"),
        (b"7z\xbc\xaf\x27\x1c", "7z"),
        (b"Rar!\x1a\x07", "rar"),
        (b"%PDF-", "pdf"),
        (b"\x7fELF", "elf"),
        // Mach-O: 32-bit and 64-bit headers, in both byte orders.
        (b"\xfe\xed\xfa\xce", "mach-o"),
        (b"\xfe\xed\xfa\xcf", "mach-o"),
        (b"\xce\xfa\xed\xfe", "mach-o"),
        (b"\xcf\xfa\xed\xfe", "mach-o"),
        (b"\xca\xfe\xba\xbe", "java-class"),
        (b"MZ", "pe"),
        (b"SQLite format 3\x00", "sqlite"),
        (b"OggS", "ogg"),
        (b"RIFF", "riff"),
        (b"\x00\x61\x73\x6d", "wasm"),
        // zlib streams: 0x78 followed by a valid FLEVEL byte.
        (b"\x78\x01", "zlib"),
        (b"\x78\x9c", "zlib"),
        (b"\x78\xda", "zlib"),
        (b"\x78\x5e", "zlib"),
    ];
    SIGS.iter()
        .find(|(sig, _)| b.starts_with(sig))
        .map(|(_, name)| *name)
}
452
453/// Parse `data` as a protobuf wire stream. Returns true only when the entire
454/// buffer is consumed by >= 3 valid (tag, value) fields with valid wire types -
455/// the profile of a real serialized message, which random bytes hit < 0.5% of
456/// the time.
457fn parse_protobuf_wire(data: &[u8]) -> bool {
458 let n = data.len();
459 if n < 8 {
460 return false;
461 }
462 let mut i = 0usize;
463 let mut fields = 0u32;
464 while i < n {
465 let Some((tag, next)) = read_varint(data, i) else {
466 return false;
467 };
468 i = next;
469 let wire = tag & 0x07;
470 let field_no = tag >> 3;
471 if field_no == 0 {
472 return false;
473 }
474 match wire {
475 0 => {
476 // varint value
477 let Some((_, next)) = read_varint(data, i) else {
478 return false;
479 };
480 i = next;
481 }
482 1 => {
483 // 64-bit fixed
484 match i.checked_add(8) {
485 Some(x) if x <= n => i = x,
486 _ => return false,
487 }
488 }
489 2 => {
490 // length-delimited
491 let Some((len, next)) = read_varint(data, i) else {
492 return false;
493 };
494 i = match next.checked_add(len as usize) {
495 Some(x) if x <= n => x,
496 _ => return false,
497 };
498 }
499 5 => {
500 // 32-bit fixed
501 match i.checked_add(4) {
502 Some(x) if x <= n => i = x,
503 _ => return false,
504 }
505 }
506 _ => return false, // 3,4 (groups, deprecated) and 6,7 (invalid)
507 }
508 fields += 1;
509 }
510 i == n && fields >= 3
511}
512
/// Reads one base-128 varint starting at `data[start]`.
///
/// Each byte contributes its low 7 bits, least-significant group first; the
/// high bit marks "more bytes follow". On success returns
/// `(value, next_index)`, where `next_index` is the position just past the
/// varint's final byte.
///
/// Returns `None` when:
/// * `start` is at or beyond the end of `data`;
/// * the buffer ends before a terminating byte (high bit clear) is seen;
/// * the varint runs past 10 bytes; or
/// * the value does not fit in a `u64` - the 10th byte may only contribute
///   bit 63, so any payload above 1 there is an overflow and is rejected
///   rather than silently truncated.
fn read_varint(data: &[u8], start: usize) -> Option<(u64, usize)> {
    /// A 64-bit value needs at most ceil(64 / 7) = 10 groups.
    const MAX_VARINT_BYTES: usize = 10;

    let tail = data.get(start..)?;
    let mut value: u64 = 0;
    for (idx, &byte) in tail.iter().take(MAX_VARINT_BYTES).enumerate() {
        let group = u64::from(byte & 0x7F);
        // Nine groups fill bits 0..=62, leaving only bit 63 for the tenth.
        if idx == MAX_VARINT_BYTES - 1 && group > 1 {
            return None;
        }
        value |= group << (7 * idx);
        if byte & 0x80 == 0 {
            return Some((value, start + idx + 1));
        }
    }
    // Ran out of input, or still saw a continuation bit on the 10th byte.
    None
}