keyhog-scanner 0.5.37

keyhog-scanner: high-performance SIMD-accelerated secret detection engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
//! Decode-structure analysis: keyhog's decode-through advantage, fed into
//! scoring.
//!
//! A generic high-entropy candidate (caught by `generic-secret`,
//! `generic-password`, `entropy-*`) is ambiguous on its surface: a real
//! base64/hex secret and a base64-wrapped *binary asset* (a PNG, a gzip blob,
//! a serialized protobuf, an embedded cert) look identical to an
//! entropy/regex/token-efficiency filter. The distinguishing signal is what
//! the candidate *decodes to* - and keyhog already decodes. This module turns
//! the decoded bytes into a verdict the confidence pipeline (and, later, the ML
//! feature vector) can use.
//!
//! The verdict is built only on **definitional** signals, so it never
//! false-suppresses a real credential:
//!   * **Magic bytes.** A blob that decodes to a PNG/JPEG/GIF/gzip/zip/PDF/ELF/
//!     Mach-O/PE/zstd/xz/bzip2/7z/SQLite/Java-class header IS that format. Over
//!     3000 random 24-48 byte secrets, ZERO carry any of these headers at
//!     offset 0 (each header is a fixed k-byte prefix - 2 to 16 bytes in the
//!     signature table - so a random blob matches it with probability 256^-k).
//!   * **Full protobuf-wire parse.** Bytes that parse end-to-end as a protobuf
//!     wire stream (valid field tags, valid wire types, length-delimited fields
//!     that stay in bounds, whole buffer consumed) with several fields are a
//!     serialized message. Random bytes parse this way <0.5% of the time, and
//!     we additionally require >= 3 fields and >= 8 bytes.
//!
//! Printable-ratio is recorded for the future ML feature but is NOT used in the
//! boolean verdict: random secret bytes and binary blobs both sit around 37-50%
//! printable, so it is too weak to gate suppression on its own.
//!
//! Tests live in `tests/unit/decode_structure*.rs` (Santh no-inline-tests
//! contract).

use base64::Engine;

/// Structured view of what a candidate decodes to. Carried as-is into the ML
/// feature vector once the model is retrained; consumed today by
/// [`is_encoded_binary`].
///
/// Produced by [`analyze`]. The `Default` value (all fields zero / `false` /
/// `None`) is what `analyze` returns for a candidate that is too short, does
/// not decode cleanly, or decodes to zero bytes.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct DecodeStructure {
    /// The candidate is a syntactically valid base64 (standard or url-safe) or
    /// hex string of a length worth decoding, and it decoded to at least one
    /// byte.
    pub decodable: bool,
    /// Number of bytes the candidate decoded to (0 when not decodable).
    pub decoded_len: usize,
    /// Fraction of decoded bytes that are printable ASCII: bytes 32..=126 plus
    /// tab, line feed and carriage return. 0.0 when not decodable.
    pub printable_ratio: f32,
    /// Identified container/format from the decoded magic bytes, if any
    /// (a short lowercase label such as `"png"` or `"gzip"`).
    pub magic: Option<&'static str>,
    /// The decoded bytes parse end-to-end as a multi-field protobuf wire stream
    /// (at least 3 fields over at least 8 bytes, whole buffer consumed).
    pub protobuf_wire: bool,
}

impl DecodeStructure {
    /// Reports whether the decoded bytes look like data rather than a
    /// credential: either a recognised magic header was found, or the bytes
    /// parsed as a protobuf wire stream of at least 8 bytes.
    #[must_use]
    pub fn is_binary_payload(&self) -> bool {
        // A magic header alone is sufficient.
        if self.magic.is_some() {
            return true;
        }
        // Otherwise require the protobuf verdict plus the minimum size.
        self.protobuf_wire && self.decoded_len >= 8
    }
}

/// Minimum candidate length, in bytes of the trimmed string, before we bother
/// decoding. A base64 blob needs >= 8 chars to carry a 4-byte magic header, so
/// this floor sits above that bare minimum; short tokens are the job of the
/// named detectors anyway.
///
/// Enforced by [`analyze`] and `compute_decoded_contains_placeholder` before
/// decoding, and again by `decode_candidate` for the hex path.
const MIN_DECODE_LEN: usize = 16;

/// Conservative verdict for the confidence pipeline: `true` when this generic
/// candidate decodes to identifiable binary / serialized data, `false` for
/// everything else (including real secrets).
///
/// The answer is `analyze(candidate).is_binary_payload()`, memoized per thread.
/// A single match is scored on this twice (ML feature #41 in `ml_features` and
/// the generic-detector confidence penalty in `confidence::penalties`), and a
/// scan re-encounters the same token across chunks, so without the cache every
/// call would re-decode and re-parse the bytes.
///
/// The cache is keyed by a 64-bit FNV-1a hash of the candidate bytes (the same
/// hash the entropy / ML-score caches key on) and is bounded: once it holds
/// `MAX_CACHE_ENTRIES` verdicts it is cleared wholesale before the next
/// insert, mirroring `entropy::shannon_entropy`.
#[must_use]
pub fn is_encoded_binary(candidate: &str) -> bool {
    use std::cell::RefCell;
    use std::collections::HashMap;

    const MAX_CACHE_ENTRIES: usize = 4096;
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    thread_local! {
        static CACHE: RefCell<HashMap<u64, bool>> = RefCell::new(HashMap::with_capacity(256));
    }

    // FNV-1a: xor the byte in, then multiply by the prime.
    let key = candidate.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });

    CACHE.with(|slot| {
        // The shared borrow ends with this statement, before the verdict is
        // computed and before the mutable borrow below.
        let cached = slot.borrow().get(&key).copied();
        if let Some(hit) = cached {
            return hit;
        }

        let fresh = analyze(candidate).is_binary_payload();

        let mut map = slot.borrow_mut();
        if map.len() >= MAX_CACHE_ENTRIES {
            map.clear();
        }
        map.insert(key, fresh);
        fresh
    })
}

/// Placeholder words that mark a credential as a documentation sample, not a
/// real secret. Shared with `confidence::penalties::contains_placeholder_word`
/// for the SURFACE form; this module's [`decoded_contains_placeholder`] runs
/// the same check against the BASE64 / HEX decoded form so a base64-wrapped
/// `AKIAEXAMPLEEXAMPLE12` (= `QUtJQUVYQU1QTEVFWEFNUExFMTI=`) is still caught.
///
/// Entries are lowercase ASCII byte strings; the decoded bytes are compared
/// against them case-insensitively, and a word matches anywhere inside the
/// decoded buffer (substring match, not whole-token).
const DECODED_PLACEHOLDER_WORDS: &[&[u8]] = &[
    b"example",
    b"dummy",
    b"fake",
    b"sample",
    b"placeholder",
    b"changeme",
];

/// Shape-only check: does `value` look like a uniform base64 blob with no
/// structure markers? Strict criteria, all of which must hold:
///   * 60..=300 bytes long;
///   * multiple-of-4 length, or trailing `=` padding;
///   * `=` appears ONLY as trailing padding, and at most two of them (base64
///     never emits more) - an interior `=` means the value is a `key=value`
///     pair or some other structured string, not a blob;
///   * every other char is in the standard base64 alphabet
///     (`A-Z a-z 0-9 + /`);
///   * it contains at least one `+` / `/`, or it has padding.
///
/// Matches the `random-base64-protobuf` corpus shape (random bytes
/// base64-encoded into a `password=`/`secret=` slot) without firing on real
/// service-anchored credentials:
///   * AWS secret access keys (40 base62 chars, no +/, no padding) - skipped
///   * GitHub PATs (40+ chars but contain `_`) - skipped
///   * npm tokens (36 chars base62) - too short, skipped
///   * Stripe keys (32 chars, `sk_`/`pk_` prefix with `_`) - skipped
///   * Slack tokens (xox*-prefixed with `-`) - skipped
///   * JWT tokens (`.` separators) - skipped
///   * OAuth bearer tokens at 40-59 chars - skipped via 60-char floor
///
/// Used by `confidence::penalties::apply_post_ml_penalties` as the generic-
/// detector branch's "this is a random base64 blob, not a credential" gate.
/// Mirror v27 had 56 base64-protobuf FPs surviving every other suppression;
/// this is the dedicated gate for that class.
#[must_use]
pub fn looks_like_uniform_base64_blob(value: &str) -> bool {
    if !(60..=300).contains(&value.len()) {
        return false;
    }

    // Split off the trailing pad run; whatever remains must be pure alphabet.
    let body = value.trim_end_matches('=');
    let pad_len = value.len() - body.len();
    if pad_len > 2 {
        return false;
    }
    let has_padding = pad_len > 0;
    if !has_padding && value.len() % 4 != 0 {
        return false;
    }

    let mut has_b64_punct = false;
    for b in body.bytes() {
        match b {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' => {}
            b'+' | b'/' => has_b64_punct = true,
            // Includes an interior `=`: padding may only appear at the end.
            _ => return false,
        }
    }
    has_b64_punct || has_padding
}

/// Reports whether `candidate`, once decoded (base64 / url-safe-base64 / hex),
/// contains any placeholder word, compared case-insensitively.
///
/// This composes keyhog's decode-through with the placeholder suppression: a
/// docs sample that arrives base64-wrapped (e.g. AWS docs publishing
/// AKIAEXAMPLEEXAMPLE12 as the base64-encoded body of a yaml secret) is
/// recognized as a sample even though its surface form looks like
/// high-entropy random bytes. Mirror v26: 9 docs-example-marker FPs (all
/// `QUtJQUVYQU1QTEVFWEFNUExFMTI=`, base64 of AKIA...EXAMPLE...12) collapsed by
/// this gate.
///
/// Memoized per thread to match the `is_encoded_binary` call cadence: verdicts
/// are keyed by a 64-bit FNV-1a hash of the candidate bytes, and the map is
/// cleared wholesale once it holds `MAX_CACHE_ENTRIES` entries.
#[must_use]
pub fn decoded_contains_placeholder(candidate: &str) -> bool {
    use std::cell::RefCell;
    use std::collections::HashMap;

    const MAX_CACHE_ENTRIES: usize = 4096;
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    thread_local! {
        static CACHE: RefCell<HashMap<u64, bool>> = RefCell::new(HashMap::with_capacity(256));
    }

    // FNV-1a over the candidate bytes, the same keying is_encoded_binary uses.
    let key = candidate.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });

    CACHE.with(|slot| {
        // Release the shared borrow before computing and before borrow_mut.
        let cached = slot.borrow().get(&key).copied();
        match cached {
            Some(hit) => hit,
            None => {
                let fresh = compute_decoded_contains_placeholder(candidate);
                let mut map = slot.borrow_mut();
                if map.len() >= MAX_CACHE_ENTRIES {
                    map.clear();
                }
                map.insert(key, fresh);
                fresh
            }
        }
    })
}

/// Uncached core of [`decoded_contains_placeholder`]: trims the candidate,
/// decodes it, and scans the decoded bytes for any entry of
/// `DECODED_PLACEHOLDER_WORDS`, ignoring ASCII case. Returns `false` when the
/// trimmed candidate is shorter than `MIN_DECODE_LEN`, does not decode, or
/// decodes to nothing.
fn compute_decoded_contains_placeholder(candidate: &str) -> bool {
    let candidate = candidate.trim();
    if candidate.len() < MIN_DECODE_LEN {
        return false;
    }

    let decoded = match decode_candidate(candidate) {
        Some(buf) if !buf.is_empty() => buf,
        _ => return false,
    };

    for needle in DECODED_PLACEHOLDER_WORDS.iter() {
        // Slide a needle-sized window across the decoded bytes.
        for slice in decoded.windows(needle.len()) {
            if slice.eq_ignore_ascii_case(needle) {
                return true;
            }
        }
    }
    false
}

/// Decodes `candidate` (base64 standard, base64 url-safe, or hex) and
/// summarises the resulting bytes: length, printable-ASCII fraction, any magic
/// header, and whether they parse as a protobuf wire stream.
///
/// Surrounding whitespace is trimmed first. A default (non-decodable)
/// structure is returned when the trimmed candidate is shorter than
/// `MIN_DECODE_LEN`, is not a clean encoding, or decodes to zero bytes.
#[must_use]
pub fn analyze(candidate: &str) -> DecodeStructure {
    let candidate = candidate.trim();
    if candidate.len() < MIN_DECODE_LEN {
        return DecodeStructure::default();
    }

    let decoded = match decode_candidate(candidate) {
        Some(buf) if !buf.is_empty() => buf,
        _ => return DecodeStructure::default(),
    };

    // Printable = space through '~', plus tab / LF / CR.
    let mut printable_count = 0usize;
    for &byte in &decoded {
        if matches!(byte, b'\t' | b'\n' | b'\r' | 0x20..=0x7E) {
            printable_count += 1;
        }
    }

    let total = decoded.len();
    DecodeStructure {
        decodable: true,
        decoded_len: total,
        printable_ratio: printable_count as f32 / total as f32,
        magic: magic_format(&decoded),
        protobuf_wire: parse_protobuf_wire(&decoded),
    }
}

/// Decode the candidate as an even-length all-hex string or, failing that, as
/// base64 (standard then url-safe, padded or not). Only accepts clean,
/// whole-string decodes so a stray match does not masquerade as binary.
///
/// Hex is tried FIRST. The base64 alphabet is a superset of hex's, so every
/// all-hex string whose length is a multiple of 4 is also a valid base64
/// string; trying base64 first would decode such hex input to garbage and the
/// hex branch would only ever run for lengths that base64 happens to reject.
/// A genuine base64 string made up solely of `[0-9a-fA-F]` over
/// `MIN_DECODE_LEN`+ chars is vanishingly rare, so hex wins the tie.
///
/// Returns `None` when `s` is neither a clean hex string of at least
/// `MIN_DECODE_LEN` chars nor decodable base64.
fn decode_candidate(s: &str) -> Option<Vec<u8>> {
    if s.len() >= MIN_DECODE_LEN && s.len() % 2 == 0 && s.bytes().all(|b| b.is_ascii_hexdigit()) {
        return s
            .as_bytes()
            .chunks_exact(2)
            .map(|pair| {
                let hi = (pair[0] as char).to_digit(16)?;
                let lo = (pair[1] as char).to_digit(16)?;
                // hi and lo are both < 16, so the combined value fits in a u8.
                Some(((hi << 4) | lo) as u8)
            })
            .collect();
    }

    let looks_b64 = s
        .bytes()
        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_' | b'='));
    if !looks_b64 {
        return None;
    }

    // Pad to a multiple of 4 so unpadded blobs decode.
    let mut padded = s.to_string();
    let rem = padded.len() % 4;
    if rem != 0 {
        padded.push_str(&"=".repeat(4 - rem));
    }
    base64::engine::general_purpose::STANDARD
        .decode(padded.as_bytes())
        .or_else(|_| base64::engine::general_purpose::URL_SAFE.decode(padded.as_bytes()))
        .ok()
}

/// Maps the leading bytes of `b` to a short format label when they match a
/// known binary container/asset signature, or `None` otherwise. These headers
/// are definitional: a stream that starts with them IS that format, and no
/// credential carries them.
///
/// Signatures are tested in table order and the first match wins.
fn magic_format(b: &[u8]) -> Option<&'static str> {
    const SIGNATURES: &[(&[u8], &str)] = &[
        (b"\x89PNG\r\n\x1a\n", "png"),
        (b"\xff\xd8\xff", "jpeg"),
        (b"GIF87a", "gif"),
        (b"GIF89a", "gif"),
        (b"\x1f\x8b", "gzip"),
        (b"BZh", "bzip2"),
        (b"\xfd7zXZ\x00", "xz"),
        (b"\x28\xb5\x2f\xfd", "zstd"),
        (b"PK\x03\x04", "zip"),
        (b"PK\x05\x06", "zip"),
        (b"7z\xbc\xaf\x27\x1c", "7z"),
        (b"Rar!\x1a\x07", "rar"),
        (b"%PDF-", "pdf"),
        (b"\x7fELF", "elf"),
        (b"\xfe\xed\xfa\xce", "mach-o"),
        (b"\xfe\xed\xfa\xcf", "mach-o"),
        (b"\xcf\xfa\xed\xfe", "mach-o"),
        (b"\xca\xfe\xba\xbe", "java-class"),
        (b"MZ", "pe"),
        (b"SQLite format 3\x00", "sqlite"),
        (b"OggS", "ogg"),
        (b"RIFF", "riff"),
        (b"\x00\x61\x73\x6d", "wasm"),
        // zlib streams: 0x78 followed by a valid FLEVEL byte.
        (b"\x78\x01", "zlib"),
        (b"\x78\x9c", "zlib"),
        (b"\x78\xda", "zlib"),
        (b"\x78\x5e", "zlib"),
    ];

    for &(prefix, label) in SIGNATURES {
        // `get` yields None when the input is shorter than the signature.
        if b.get(..prefix.len()) == Some(prefix) {
            return Some(label);
        }
    }
    None
}

/// Parse `data` as a protobuf wire stream. Returns true only when the entire
/// buffer is consumed by >= 3 valid (tag, value) fields with valid wire types -
/// the profile of a real serialized message, which random bytes hit < 0.5% of
/// the time.
///
/// Rejected outright: buffers shorter than 8 bytes, field number 0, wire types
/// 3/4 (groups, deprecated) and 6/7 (invalid), truncated varints, and any
/// fixed-width or length-delimited value that would run past the end of
/// `data`.
fn parse_protobuf_wire(data: &[u8]) -> bool {
    let n = data.len();
    if n < 8 {
        return false;
    }
    let mut i = 0usize;
    let mut fields = 0u32;
    while i < n {
        let Some((tag, after_tag)) = read_varint(data, i) else {
            return false;
        };
        // Low 3 bits are the wire type, the rest is the field number.
        if tag >> 3 == 0 {
            return false;
        }
        i = match tag & 0x07 {
            // varint value
            0 => match read_varint(data, after_tag) {
                Some((_, next)) => next,
                None => return false,
            },
            // 64-bit fixed
            1 => match after_tag.checked_add(8) {
                Some(end) if end <= n => end,
                _ => return false,
            },
            // length-delimited
            2 => {
                let Some((len, payload_start)) = read_varint(data, after_tag) else {
                    return false;
                };
                // Compare in u64 BEFORE narrowing: a plain `len as usize`
                // truncates on 32-bit targets and could let an oversized
                // length wrap back into bounds.
                let remaining = n.saturating_sub(payload_start) as u64;
                if len > remaining {
                    return false;
                }
                // len <= remaining <= usize::MAX, so this cast is lossless.
                payload_start + len as usize
            }
            // 32-bit fixed
            5 => match after_tag.checked_add(4) {
                Some(end) if end <= n => end,
                _ => return false,
            },
            // 3,4 (groups, deprecated) and 6,7 (invalid)
            _ => return false,
        };
        fields += 1;
    }
    i == n && fields >= 3
}

/// Reads one base-128 varint beginning at `data[start]`.
///
/// Returns `Some((value, next_index))`, where `next_index` is the position
/// just past the varint's final byte. Returns `None` when `start` is at or
/// beyond the end of `data`, when the buffer ends before a terminating byte
/// (high bit clear) is seen, or when ten bytes in a row all carry the
/// continuation bit.
fn read_varint(data: &[u8], start: usize) -> Option<(u64, usize)> {
    let tail = data.get(start..)?;
    let mut acc: u64 = 0;
    // At most ten 7-bit groups are read (shifts 0, 7, ..., 63).
    for (idx, &byte) in tail.iter().enumerate().take(10) {
        acc |= u64::from(byte & 0x7F) << (7 * idx as u32);
        if byte < 0x80 {
            return Some((acc, start + idx + 1));
        }
    }
    None
}