Skip to main content

keyhog_scanner/
decode_structure.rs

1//! Decode-structure analysis: keyhog's decode-through advantage, fed into
2//! scoring.
3//!
4//! A generic high-entropy candidate (caught by `generic-secret`,
5//! `generic-password`, `entropy-*`) is ambiguous on its surface: a real
6//! base64/hex secret and a base64-wrapped *binary asset* (a PNG, a gzip blob,
7//! a serialized protobuf, an embedded cert) look identical to an
8//! entropy/regex/token-efficiency filter. The distinguishing signal is what
9//! the candidate *decodes to* - and keyhog already decodes. This module turns
10//! the decoded bytes into a verdict the confidence pipeline (and, later, the ML
11//! feature vector) can use.
12//!
13//! The verdict is built only on **definitional** signals, so it never
14//! false-suppresses a real credential:
15//!   * **Magic bytes.** A blob that decodes to a PNG/JPEG/GIF/gzip/zip/PDF/ELF/
16//!     Mach-O/PE/zstd/xz/bzip2/7z/SQLite/Java-class header IS that format. Over
17//!     3000 random 24-48 byte secrets, ZERO carry any of these headers at
18//!     offset 0 (they are 4-8 specific bytes out of 256^k).
19//!   * **Full protobuf-wire parse.** Bytes that parse end-to-end as a protobuf
20//!     wire stream (valid field tags, valid wire types, length-delimited fields
21//!     that stay in bounds, whole buffer consumed) with several fields are a
22//!     serialized message. Random bytes parse this way <0.5% of the time, and
23//!     we additionally require >= 3 fields and >= 8 bytes.
24//!
25//! Printable-ratio is recorded for the future ML feature but is NOT used in the
26//! boolean verdict: random secret bytes and binary blobs both sit around 37-50%
27//! printable, so it is too weak to gate suppression on its own.
28//!
29//! Tests live in `tests/unit/decode_structure*.rs` (Santh no-inline-tests
30//! contract).
31
32use base64::Engine;
33
/// Summary of what a candidate string decodes to. [`analyze`] produces it and
/// [`DecodeStructure::is_binary_payload`] condenses it into the boolean that
/// [`is_encoded_binary`] reports; the individual fields are kept so the same
/// view can be carried into the ML feature vector once the model is retrained.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct DecodeStructure {
    /// `true` when the candidate was long enough to try and decoded cleanly
    /// as base64 (standard or url-safe) or hex.
    pub decodable: bool,
    /// Length in bytes of the decoded payload; stays `0` when nothing decoded.
    pub decoded_len: usize,
    /// Share of decoded bytes that are printable ASCII, with tab, line feed
    /// and carriage return counted as printable.
    pub printable_ratio: f32,
    /// Name of the container/asset format whose magic bytes lead the decoded
    /// payload, when one is recognized.
    pub magic: Option<&'static str>,
    /// `true` when the whole decoded payload parses as a multi-field protobuf
    /// wire stream.
    pub protobuf_wire: bool,
}

impl DecodeStructure {
    /// Reports whether the decoded payload is data rather than a credential:
    /// either a recognized binary format, or a protobuf wire stream of at
    /// least 8 decoded bytes.
    #[must_use]
    pub fn is_binary_payload(&self) -> bool {
        // A magic header settles the question on its own.
        if self.magic.is_some() {
            return true;
        }
        self.protobuf_wire && self.decoded_len >= 8
    }
}
60
/// Minimum candidate length before we bother decoding. A base64 blob needs
/// >= 8 chars to carry a 4-byte magic header, and short tokens are the job of
/// the named detectors anyway.
///
/// Compared against the byte length of the *trimmed* candidate by
/// [`analyze`] and the other decode helpers in this module; anything shorter
/// is reported as not decodable without attempting a decode.
const MIN_DECODE_LEN: usize = 16;
65
66/// Conservative verdict for the confidence pipeline: does this generic
67/// candidate decode to identifiable binary / serialized data? Real secrets
68/// return `false`.
69///
70/// Memoized: a single match is scored on this twice (ML feature #41 in
71/// `ml_features` and the generic-detector confidence penalty in
72/// `confidence::penalties`), and a scan re-encounters the same token across
73/// chunks. Without the cache every call re-decodes and re-parses the bytes.
74/// Thread-local + bounded with wholesale eviction, mirroring
75/// `entropy::shannon_entropy`. The verdict is a pure function of `candidate`,
76/// so caching by content hash is always correct.
77#[must_use]
78pub fn is_encoded_binary(candidate: &str) -> bool {
79    use std::cell::RefCell;
80    use std::collections::HashMap;
81
82    const MAX_CACHE_ENTRIES: usize = 4096;
83
84    thread_local! {
85        static CACHE: RefCell<HashMap<u64, bool>> = RefCell::new(HashMap::with_capacity(256));
86    }
87
88    // FNV-1a over the candidate bytes - the same hash the entropy / ML-score
89    // caches key on.
90    let mut hash: u64 = 0xcbf29ce484222325;
91    for &byte in candidate.as_bytes() {
92        hash ^= u64::from(byte);
93        hash = hash.wrapping_mul(0x100000001b3);
94    }
95
96    CACHE.with(|cache| {
97        if let Some(&verdict) = cache.borrow().get(&hash) {
98            return verdict;
99        }
100        let verdict = analyze(candidate).is_binary_payload();
101        let mut cache = cache.borrow_mut();
102        if cache.len() >= MAX_CACHE_ENTRIES {
103            cache.clear();
104        }
105        cache.insert(hash, verdict);
106        verdict
107    })
108}
109
/// Placeholder words that mark a credential as a documentation sample, not a
/// real secret. The single source of truth for the lowercase byte-slice
/// placeholder set: consumed for the SURFACE form by
/// `confidence::penalties::contains_placeholder_word` and for the BASE64 / HEX
/// decoded form by this module's [`decoded_contains_placeholder`] (so a
/// base64-wrapped `AKIAEXAMPLEEXAMPLE12` = `QUtJQUVYQU1QTEVFWEFNUExFMTI=` is
/// still caught).
///
/// Excludes ambiguous tokens by design: `test` (real Stripe `sk_test_` keys),
/// `password` (connection strings `redis://user:password@host`), `admin` /
/// `root` (legitimate credentials), `qwerty` (weak but real password).
///
/// Every entry must be non-empty: the decoded-form matcher in this module
/// slides a window of `word.len()` bytes over the payload, and a zero-length
/// window panics. That matcher compares ASCII case-insensitively, so entries
/// only need to be listed once, in lowercase.
pub const PLACEHOLDER_WORDS: &[&[u8]] = &[
    b"example",
    b"dummy",
    b"fake",
    b"sample",
    b"placeholder",
    b"changeme",
];
129
/// Shape-only gate for the "uniform random base64 blob" class, parameterized
/// so every base64-protobuf-decoy gate in the scanner shares one definition.
/// It replaces two copies that had drifted apart (this module's penalty-path
/// [`looks_like_uniform_base64_blob`] and the entropy path's
/// `engine::fallback_entropy_helpers::entropy_path_looks_like_random_base64_blob`),
/// so the length and diversity bands are tuned in exactly one place.
///
/// `value` qualifies when all of the following hold:
///   1. its length lies in `min_len..=max_len`;
///   2. its length is a multiple of 4, or it ends in `=`;
///   3. every byte is in the standard base64 alphabet (`A-Za-z0-9`, `+`, `/`,
///      `=`). Anything else - `-`, `_`, `.`, ... - rejects immediately, which
///      clears base64url tokens (GitHub PATs, OAuth bearers), JWTs (`.`) and
///      Slack tokens (`-`);
///   4. at least one admit clause fires: it contains `+` or `/`, OR it ends
///      in `=`, OR its length is a multiple of 4 and it uses at least
///      `min_diversity` distinct alphanumeric characters. The diversity
///      clause exists for pure-alphanumeric base64 (no `+/`), which encoded
///      random bytes reach but placeholders and English words do not at the
///      band floor.
///
/// Note that `=` is only checked as an alphabet member: it is tolerated at
/// any position, and a string ending in `=` passes step 2 whatever its
/// length.
///
/// Passing `min_diversity == 0` switches the diversity clause off, leaving
/// only punctuation and padding as admits. That is the stricter "structural
/// punctuation required" behavior the entropy path wants; its wrapper adds
/// the further requirement of BOTH `+` and `/` on top of this skeleton.
#[must_use]
pub fn is_random_base64_blob(
    value: &str,
    min_len: usize,
    max_len: usize,
    min_diversity: u32,
) -> bool {
    let len = value.len();
    if len < min_len || len > max_len {
        return false;
    }

    let padded = value.ends_with('=');
    let quad_aligned = len % 4 == 0;
    if !(padded || quad_aligned) {
        return false;
    }

    // One bit per ASCII code point; only alphanumerics are ever recorded.
    let mut alnum_mask: u128 = 0;
    let mut saw_punct = false;
    for byte in value.bytes() {
        if byte.is_ascii_alphanumeric() {
            alnum_mask |= 1u128 << byte;
        } else if byte == b'+' || byte == b'/' {
            saw_punct = true;
        } else if byte != b'=' {
            // Outside the standard base64 alphabet.
            return false;
        }
    }

    if saw_punct || padded {
        return true;
    }
    // Diversity admit; a zero floor disables it.
    min_diversity != 0 && quad_aligned && alnum_mask.count_ones() >= min_diversity
}
201
202/// Shape-only check: does `value` look like a uniform base64 blob with no
203/// structure markers? Thin wrapper over [`is_random_base64_blob`] with the
204/// penalty-path band (44..=600) and diversity floor (32). Matches the
205/// `random-base64-protobuf` corpus shape (random bytes base64-encoded into a
206/// `password=`/`secret=` slot) without firing on real service-anchored
207/// credentials:
208///   * AWS secret access keys (40 base62 chars, no +/, no padding) - too short
209///   * GitHub PATs (40+ chars but contain `_`) - skipped (alphabet check)
210///   * npm tokens (36 chars base62) - too short, skipped
211///   * Stripe keys (32 chars, `sk_`/`pk_` prefix with `_`) - skipped
212///   * Slack tokens (xox*-prefixed with `-`) - skipped
213///   * JWT tokens (`.` separators) - skipped
214///   * OAuth bearer tokens with `-`/`_` (base64url) - skipped via alphabet
215///
216/// Used by `confidence::penalties::apply_post_ml_penalties` as the generic-
217/// detector branch's "this is a random base64 blob, not a credential" gate.
218/// Mirror v27 had 56 base64-protobuf FPs surviving every other suppression;
219/// this is the dedicated gate for that class. v33 widened the floor from
220/// 60 to 44 and added a high-diversity admit so pure-alphanumeric base64
221/// (lacking +/) is also slammed - 14+ FPs in the corpus relied on the
222/// gap.
223#[must_use]
224pub fn looks_like_uniform_base64_blob(value: &str) -> bool {
225    is_random_base64_blob(value, 44, 600, 32)
226}
227
228/// True when `value` base64-decodes to bytes that are themselves all in
229/// the base64 alphabet (double-encoded base64). k8s `data:` fields wrap
230/// their values in another base64 layer; the inner decoded bytes are the
231/// actual user content, and when those bytes are themselves a printable
232/// base64 blob the outer wrapper is categorically data, not a credential.
233///
234/// Conservative: requires the decoded length to be >= 32 chars AND the
235/// decoded bytes to be all standard-base64 alphabet (A-Za-z0-9+/=).
236/// Random secret bytes would produce non-base64 bytes (non-printable,
237/// 0x00..0x20, 0x80..0xFF) so this is definitional, not heuristic.
238///
239/// Memoized via the same FNV-1a hash + thread-local cache pattern as the
240/// other decode-through helpers.
241#[must_use]
242pub fn decoded_is_base64_blob(candidate: &str) -> bool {
243    use std::cell::RefCell;
244    use std::collections::HashMap;
245
246    const MAX_CACHE_ENTRIES: usize = 4096;
247
248    thread_local! {
249        static CACHE: RefCell<HashMap<u64, bool>> = RefCell::new(HashMap::with_capacity(256));
250    }
251
252    let mut hash: u64 = 0xcbf29ce484222325;
253    for &byte in candidate.as_bytes() {
254        hash ^= u64::from(byte);
255        hash = hash.wrapping_mul(0x100000001b3);
256    }
257
258    CACHE.with(|cache| {
259        if let Some(&verdict) = cache.borrow().get(&hash) {
260            return verdict;
261        }
262        let verdict = compute_decoded_is_base64_blob(candidate);
263        let mut cache = cache.borrow_mut();
264        if cache.len() >= MAX_CACHE_ENTRIES {
265            cache.clear();
266        }
267        cache.insert(hash, verdict);
268        verdict
269    })
270}
271
272fn compute_decoded_is_base64_blob(candidate: &str) -> bool {
273    let trimmed = candidate.trim();
274    if trimmed.len() < MIN_DECODE_LEN {
275        return false;
276    }
277    let Some(bytes) = decode_candidate(trimmed) else {
278        return false;
279    };
280    if bytes.len() < 32 {
281        return false;
282    }
283    bytes
284        .iter()
285        .all(|&b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'='))
286}
287
288/// Decode `candidate` (base64 / url-safe-base64 / hex) and check whether the
289/// decoded bytes contain any placeholder word case-insensitively. Composes
290/// keyhog's decode-through with the placeholder suppression: a docs sample
291/// that arrives base64-wrapped (e.g. AWS docs publishing AKIAEXAMPLEEXAMPLE12
292/// as the base64-encoded body of a yaml secret) is now recognized as a sample
293/// even though the surface form looks like high-entropy random bytes. Mirror
294/// v26: 9 docs-example-marker FPs (all `QUtJQUVYQU1QTEVFWEFNUExFMTI=`, base64
295/// of AKIA...EXAMPLE...12) collapsed by this gate. Memoized to match the
296/// existing `is_encoded_binary` call cadence.
297#[must_use]
298pub fn decoded_contains_placeholder(candidate: &str) -> bool {
299    use std::cell::RefCell;
300    use std::collections::HashMap;
301
302    const MAX_CACHE_ENTRIES: usize = 4096;
303
304    thread_local! {
305        static CACHE: RefCell<HashMap<u64, bool>> = RefCell::new(HashMap::with_capacity(256));
306    }
307
308    // FNV-1a over the candidate bytes - keyed identically to is_encoded_binary
309    // so the two caches cost a single hash per credential.
310    let mut hash: u64 = 0xcbf29ce484222325;
311    for &byte in candidate.as_bytes() {
312        hash ^= u64::from(byte);
313        hash = hash.wrapping_mul(0x100000001b3);
314    }
315
316    CACHE.with(|cache| {
317        if let Some(&verdict) = cache.borrow().get(&hash) {
318            return verdict;
319        }
320        let verdict = compute_decoded_contains_placeholder(candidate);
321        let mut cache = cache.borrow_mut();
322        if cache.len() >= MAX_CACHE_ENTRIES {
323            cache.clear();
324        }
325        cache.insert(hash, verdict);
326        verdict
327    })
328}
329
330fn compute_decoded_contains_placeholder(candidate: &str) -> bool {
331    let trimmed = candidate.trim();
332    if trimmed.len() < MIN_DECODE_LEN {
333        return false;
334    }
335    let Some(bytes) = decode_candidate(trimmed) else {
336        return false;
337    };
338    if bytes.is_empty() {
339        return false;
340    }
341    PLACEHOLDER_WORDS.iter().any(|word| {
342        bytes
343            .windows(word.len())
344            .any(|window| window.eq_ignore_ascii_case(word))
345    })
346}
347
348/// Decode `candidate` (base64 standard, base64 url-safe, or hex) and describe
349/// the resulting bytes. Returns a default (non-decodable) structure when the
350/// candidate is too short or not a clean encoding.
351#[must_use]
352pub fn analyze(candidate: &str) -> DecodeStructure {
353    let trimmed = candidate.trim();
354    if trimmed.len() < MIN_DECODE_LEN {
355        return DecodeStructure::default();
356    }
357    let Some(bytes) = decode_candidate(trimmed) else {
358        return DecodeStructure::default();
359    };
360    if bytes.is_empty() {
361        return DecodeStructure::default();
362    }
363    let printable = bytes
364        .iter()
365        .filter(|&&b| (32..127).contains(&b) || matches!(b, 9 | 10 | 13))
366        .count();
367    DecodeStructure {
368        decodable: true,
369        decoded_len: bytes.len(),
370        printable_ratio: printable as f32 / bytes.len() as f32,
371        magic: magic_format(&bytes),
372        protobuf_wire: parse_protobuf_wire(&bytes),
373    }
374}
375
376/// Decode the candidate as base64 (standard then url-safe, padded or not) or,
377/// failing that, as an even-length all-hex string. Only accepts clean,
378/// whole-string decodes so a stray match does not masquerade as binary.
379fn decode_candidate(s: &str) -> Option<Vec<u8>> {
380    // base64 alphabets are a superset of hex's, so try base64 first and only
381    // fall back to hex for strings that are NOT valid base64.
382    let looks_b64 = s
383        .bytes()
384        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_' | b'='));
385    if looks_b64 {
386        // Pad to a multiple of 4 so unpadded blobs decode.
387        let mut padded = s.to_string();
388        let rem = padded.len() % 4;
389        if rem != 0 {
390            padded.push_str(&"=".repeat(4 - rem));
391        }
392        if let Ok(b) = base64::engine::general_purpose::STANDARD.decode(padded.as_bytes()) {
393            return Some(b);
394        }
395        if let Ok(b) = base64::engine::general_purpose::URL_SAFE.decode(padded.as_bytes()) {
396            return Some(b);
397        }
398    }
399    if s.len() >= MIN_DECODE_LEN && s.len() % 2 == 0 && s.bytes().all(|b| b.is_ascii_hexdigit()) {
400        let mut out = Vec::with_capacity(s.len() / 2);
401        let raw = s.as_bytes();
402        let mut i = 0;
403        while i + 1 < raw.len() {
404            let hi = (raw[i] as char).to_digit(16)?;
405            let lo = (raw[i + 1] as char).to_digit(16)?;
406            out.push(((hi << 4) | lo) as u8);
407            i += 2;
408        }
409        return Some(out);
410    }
411    None
412}
413
/// Identify common binary container/asset formats by their leading magic
/// bytes. These headers are definitional: a stream that starts with them IS
/// that format, and no credential carries them.
///
/// Returns the format name of the first signature in the table that `b`
/// starts with, or `None` when no signature matches (which includes any `b`
/// shorter than every signature).
fn magic_format(b: &[u8]) -> Option<&'static str> {
    const SIGS: &[(&[u8], &str)] = &[
        (b"\x89PNG\r\n\x1a\n", "png"),
        (b"\xff\xd8\xff", "jpeg"),
        (b"GIF87a", "gif"),
        (b"GIF89a", "gif"),
        (b"\x1f\x8b", "gzip"),
        (b"BZh", "bzip2"),
        (b"\xfd7zXZ\x00", "xz"),
        (b"\x28\xb5\x2f\xfd", "zstd"),
        (b"PK\x03\x04", "zip"),
        (b"PK\x05\x06", "zip"),
        (b"7z\xbc\xaf\x27\x1c", "7z"),
        (b"Rar!\x1a\x07", "rar"),
        (b"%PDF-", "pdf"),
        (b"\x7fELF", "elf"),
        // Mach-O thin binaries: 32-bit and 64-bit magic, each in both byte
        // orders (0xfeedface / 0xfeedfacf and their byte-swapped forms).
        (b"\xfe\xed\xfa\xce", "mach-o"),
        (b"\xfe\xed\xfa\xcf", "mach-o"),
        (b"\xce\xfa\xed\xfe", "mach-o"),
        (b"\xcf\xfa\xed\xfe", "mach-o"),
        (b"\xca\xfe\xba\xbe", "java-class"),
        (b"MZ", "pe"),
        (b"SQLite format 3\x00", "sqlite"),
        (b"OggS", "ogg"),
        (b"RIFF", "riff"),
        (b"\x00\x61\x73\x6d", "wasm"),
        // zlib streams: CMF byte 0x78 (deflate, 32K window) followed by the
        // FLG byte for each of the four compression levels, no preset
        // dictionary.
        (b"\x78\x01", "zlib"),
        (b"\x78\x9c", "zlib"),
        (b"\x78\xda", "zlib"),
        (b"\x78\x5e", "zlib"),
    ];
    SIGS.iter()
        .find_map(|&(sig, name)| b.starts_with(sig).then_some(name))
}
452
453/// Parse `data` as a protobuf wire stream. Returns true only when the entire
454/// buffer is consumed by >= 3 valid (tag, value) fields with valid wire types -
455/// the profile of a real serialized message, which random bytes hit < 0.5% of
456/// the time.
457fn parse_protobuf_wire(data: &[u8]) -> bool {
458    let n = data.len();
459    if n < 8 {
460        return false;
461    }
462    let mut i = 0usize;
463    let mut fields = 0u32;
464    while i < n {
465        let Some((tag, next)) = read_varint(data, i) else {
466            return false;
467        };
468        i = next;
469        let wire = tag & 0x07;
470        let field_no = tag >> 3;
471        if field_no == 0 {
472            return false;
473        }
474        match wire {
475            0 => {
476                // varint value
477                let Some((_, next)) = read_varint(data, i) else {
478                    return false;
479                };
480                i = next;
481            }
482            1 => {
483                // 64-bit fixed
484                match i.checked_add(8) {
485                    Some(x) if x <= n => i = x,
486                    _ => return false,
487                }
488            }
489            2 => {
490                // length-delimited
491                let Some((len, next)) = read_varint(data, i) else {
492                    return false;
493                };
494                i = match next.checked_add(len as usize) {
495                    Some(x) if x <= n => x,
496                    _ => return false,
497                };
498            }
499            5 => {
500                // 32-bit fixed
501                match i.checked_add(4) {
502                    Some(x) if x <= n => i = x,
503                    _ => return false,
504                }
505            }
506            _ => return false, // 3,4 (groups, deprecated) and 6,7 (invalid)
507        }
508        fields += 1;
509    }
510    i == n && fields >= 3
511}
512
/// Read a base-128 varint starting at `data[start]`.
///
/// Returns `(value, next_index)`, where `next_index` is the position just
/// past the varint's final byte. Returns `None` when `start` is at or beyond
/// the end of `data`, when the buffer ends before a byte without the
/// continuation bit is found, or when the varint is still continuing after
/// ten bytes. Bits of the tenth byte that do not fit in 64 bits are dropped.
fn read_varint(data: &[u8], start: usize) -> Option<(u64, usize)> {
    let tail = data.get(start..)?;
    let mut value = 0u64;
    // Shifts 0, 7, ..., 63: pairing bytes with them caps the read at ten.
    let shifts = (0u32..=63).step_by(7);
    for (consumed, (&byte, shift)) in tail.iter().zip(shifts).enumerate() {
        value |= u64::from(byte & 0x7F) << shift;
        if byte & 0x80 == 0 {
            return Some((value, start + consumed + 1));
        }
    }
    None
}