Skip to main content

keyhog_scanner/decode/
mod.rs

//! Decode-through scanning: decode base64 and hex strings before pattern matching.
//!
//! Catches secrets hidden behind encoding layers - Kubernetes manifests,
//! CI/CD configs, and hex-encoded credentials.
5
6mod base64;
7pub mod caesar;
8pub mod hex;
9mod json;
10mod pipeline;
11pub mod reverse;
12mod unicode_escape;
13mod url;
14pub mod util;
15
16pub use base64::{base64_decode, find_base64_strings, z85_decode};
17pub use hex::hex_decode;
18pub use pipeline::{decode_chunk, register_decoder};
19
20use keyhog_core::Chunk;
21
/// Minimum contiguous encoded-alphabet run that makes a chunk worth decoding.
///
/// Base64 of a ~16-byte secret is ~24 chars; shorter runs are too small to
/// hide a credential and would only add prefilter-bypass cost.
const MIN_DECODABLE_RUN: usize = 24;

/// Cheap, allocation-free gate: does `data` contain a contiguous run of
/// base64-/hex-alphabet bytes long enough to plausibly hide an encoded secret?
///
/// The direct-match prefilters (`AlphabetScreen`, the bigram bloom) reject a
/// chunk that carries none of any detector's literal bytes/bigrams - which is
/// EXACTLY the shape of a fully-encoded secret (`data = "<base64>"`), whose
/// plaintext keyword/prefix only appears AFTER decoding. Those chunks would be
/// dropped before decode-through ever ran. This gate lets the scan entry route
/// such a chunk into a decode-only pass instead of skipping it, bounded to
/// chunks that actually look encoded so normal traffic keeps the fast skip.
///
/// Returns `true` as soon as [`MIN_DECODABLE_RUN`] consecutive bytes from the
/// encoded alphabet have been seen, and `false` if the input (including an
/// empty one) ends without such a run. The scan is a single pass over `data`.
pub(crate) fn has_decodable_payload(data: &[u8]) -> bool {
    // Length of the encoded-alphabet run ending at the current byte.
    let mut run = 0usize;
    for &byte in data {
        // Standard and url-safe base64 alphabets; hex digits are a subset of
        // the alphanumerics. Padding `=` is included so a trailing-padded
        // blob still counts towards the run.
        let encoded_byte =
            byte.is_ascii_alphanumeric() || matches!(byte, b'+' | b'/' | b'=' | b'-' | b'_');
        if encoded_byte {
            run += 1;
            // Exit mid-run: once the threshold is reached the rest of the
            // chunk cannot change the answer.
            if run >= MIN_DECODABLE_RUN {
                return true;
            }
        } else {
            // Any byte outside the alphabet breaks contiguity.
            run = 0;
        }
    }
    false
}
55
56/// A trait for decoding chunks to find hidden secrets.
57pub trait Decoder: Send + Sync {
58    fn name(&self) -> &'static str;
59    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk>;
60}
61
/// Candidate encoded string discovered during pre-decoding extraction.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EncodedString {
    /// The candidate text, held as an owned `String`.
    pub value: String,
}