keyhog_scanner/decode/mod.rs
1//! Decode-through scanning: decode base64 and hex strings before pattern matching.
2//!
3//! Catches secrets hidden behind encoding layers - Kubernetes manifests,
4//! CI/CD configs, and hex-encoded credentials.
5
6mod base64;
7pub mod caesar;
8pub mod hex;
9mod json;
10mod pipeline;
11pub mod reverse;
12mod unicode_escape;
13mod url;
14pub mod util;
15
16pub use base64::{base64_decode, find_base64_strings, z85_decode};
17pub use hex::hex_decode;
18pub use pipeline::{decode_chunk, register_decoder};
19
20use keyhog_core::Chunk;
21
/// Shortest unbroken stretch of encoded-alphabet bytes that qualifies a chunk
/// for decoding.
///
/// Base64 of a roughly 16-byte secret comes out at about 24 characters, so any
/// shorter stretch is too small to conceal a credential and admitting it would
/// only spend prefilter-bypass work for nothing.
const MIN_DECODABLE_RUN: usize = 24;

/// Reports whether `data` holds at least one unbroken stretch of
/// base64-/hex-alphabet bytes that is [`MIN_DECODABLE_RUN`] bytes or longer.
///
/// This is a single linear pass that performs no allocation.
///
/// Why the gate exists: the direct-match prefilters (`AlphabetScreen`, the
/// bigram bloom) discard any chunk that contains none of a detector's literal
/// bytes/bigrams. A fully-encoded secret (`data = "<base64>"`) has exactly
/// that shape, because its plaintext keyword/prefix only shows up AFTER
/// decoding, so such a chunk would be thrown away before decode-through got a
/// chance to run. The scan entry uses this check to send those chunks through
/// a decode-only pass instead of skipping them, while chunks that do not look
/// encoded keep taking the fast skip.
///
/// Returns `false` for empty input.
pub(crate) fn has_decodable_payload(data: &[u8]) -> bool {
    // A single predicate serves base64 (standard and URL-safe) as well as hex,
    // because hex digits are a subset of the ASCII alphanumerics. `=` is
    // accepted so that trailing padding lengthens a run rather than ending it.
    let in_alphabet = |byte: &u8| {
        byte.is_ascii_alphanumeric() || matches!(*byte, b'+' | b'/' | b'=' | b'-' | b'_')
    };

    // Cutting the input at every out-of-alphabet byte leaves precisely the
    // maximal in-alphabet runs; one run of sufficient length is enough.
    data.split(|byte| !in_alphabet(byte))
        .any(|run| run.len() >= MIN_DECODABLE_RUN)
}
55
/// A trait for decoding chunks to find hidden secrets.
///
/// An implementation takes one [`Chunk`] and produces zero or more derived
/// chunks. Implementors must be `Send + Sync` (supertrait bounds).
///
/// NOTE(review): the bounds are presumably there so decoders can be shared
/// across scanner threads, and implementations are presumably installed through
/// `register_decoder` — confirm both in the `pipeline` module.
pub trait Decoder: Send + Sync {
    /// Returns a `'static` name for this decoder.
    ///
    /// NOTE(review): presumably used to identify the decoder in the pipeline or
    /// in diagnostics — confirm against callers.
    fn name(&self) -> &'static str;

    /// Decodes `chunk` and returns the chunks derived from it.
    ///
    /// NOTE(review): an empty `Vec` presumably means nothing decodable was
    /// found — confirm against the implementations.
    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk>;
}
61
/// Candidate encoded string discovered during pre-decoding extraction.
///
/// NOTE(review): this type derives nothing (no `Debug`, `Clone`, or
/// `PartialEq`). If that is deliberate — e.g. to keep candidate secrets out of
/// debug output — it is worth stating here; otherwise confirm whether derives
/// are wanted.
pub struct EncodedString {
    /// The candidate text itself.
    /// NOTE(review): presumably still in its encoded form, exactly as extracted
    /// from the input — confirm against the extractors that build this struct.
    pub value: String,
}