keyhog_scanner/decode/
pipeline.rs

1use super::base64::{Base64Decoder, Z85Decoder};
2use super::caesar::CaesarDecoder;
3use super::hex::HexDecoder;
4use super::json::JsonDecoder;
5use super::reverse::ReverseDecoder;
6use super::url::{
7    HexEscapeDecoder, HtmlNamedEntityDecoder, HtmlNumericEntityDecoder, MimeEncodedWordDecoder,
8    OctalEscapeDecoder, QuotedPrintableDecoder, UnicodeEscapeDecoder, UrlDecoder,
9};
10use super::Decoder;
11use keyhog_core::{Chunk, ChunkMetadata};
12use std::collections::{HashSet, VecDeque};
13
/// Process-wide decoder registry. It is initialized at most once: either
/// lazily by `get_decoders` on first use, or up front by `register_decoder`.
/// After that it is only ever read.
static DECODERS: std::sync::OnceLock<Vec<Box<dyn Decoder>>> = std::sync::OnceLock::new();

/// Maximum number of unique decoded chunks `decode_chunk` counts for one root
/// chunk before it stops decoding and returns what it has collected so far.
const MAX_DECODED_CHUNKS_PER_ROOT: usize = 1000;
/// Maximum cumulative size (64 MiB) of decoded chunk data that `decode_chunk`
/// accumulates for one root chunk before it stops decoding.
const MAX_DECODED_TOTAL_BYTES: usize = 64 * 1024 * 1024;
/// Hard ceiling on the wall-clock time decode_chunk may spend on ONE chunk
/// when the caller didn't pass an explicit deadline. Mitigates decode-bomb
/// inputs (multi-layer base64 of unrelated data) that the existing
/// MAX_DECODED_TOTAL_BYTES cap doesn't catch when each layer fits under the
/// total budget but together blow the wall budget. Tuned generously: 50 ms
/// is ~10x the cost of a normal chunk's full decode-through; pathological
/// inputs hit it before the user notices.
///
/// When the caller does pass a deadline, `decode_chunk` applies whichever of
/// the two expires first.
const DEFAULT_DECODE_WALL_BUDGET_MS: u64 = 50;
26
27fn get_decoders() -> &'static [Box<dyn Decoder>] {
28    DECODERS.get_or_init(|| {
29        vec![
30            Box::new(Base64Decoder),
31            Box::new(HexDecoder),
32            Box::new(UrlDecoder),
33            Box::new(QuotedPrintableDecoder),
34            Box::new(HtmlNamedEntityDecoder),
35            Box::new(HtmlNumericEntityDecoder),
36            Box::new(HexEscapeDecoder),
37            Box::new(OctalEscapeDecoder),
38            Box::new(MimeEncodedWordDecoder),
39            Box::new(UnicodeEscapeDecoder),
40            // JSON unescape - strips `\"` / `\\` / `\n` style escapes
41            // inside JSON string values so credentials stored as
42            // JSON-encoded fields (the most common shape after .env)
43            // survive into the scanner. Originally implemented but
44            // never registered - the adversarial_explosion_runner's
45            // `json` wrapper class surfaced ~73 misses that wiring
46            // this in closed (5792/5792 variants now fire).
47            Box::new(JsonDecoder),
48            Box::new(Z85Decoder),
49            Box::new(ReverseDecoder),
50            Box::new(CaesarDecoder),
51        ]
52    })
53}
54
55/// Register a custom decoder. Must be called BEFORE any scan runs.
56/// Panics if the decoder list has already been initialized.
57pub fn register_decoder(decoder: Box<dyn Decoder>) {
58    // After initialization, the decoder list is immutable for lock-free reads.
59    // Custom decoders must be registered before the first scan.
60    if DECODERS.get().is_some() {
61        tracing::warn!("register_decoder called after initialization: decoder ignored. Fix: register custom decoders before scanning.");
62        return;
63    }
64    // KEEP THIS LIST IN SYNC with `get_decoders()` above - they're
65    // two paths to the same initialized state, and a decoder missing
66    // here would silently vanish from any custom-decoder-registered
67    // run.
68    let mut decoders: Vec<Box<dyn Decoder>> = vec![
69        Box::new(Base64Decoder),
70        Box::new(HexDecoder),
71        Box::new(UrlDecoder),
72        Box::new(QuotedPrintableDecoder),
73        Box::new(HtmlNamedEntityDecoder),
74        Box::new(HtmlNumericEntityDecoder),
75        Box::new(HexEscapeDecoder),
76        Box::new(OctalEscapeDecoder),
77        Box::new(MimeEncodedWordDecoder),
78        Box::new(UnicodeEscapeDecoder),
79        Box::new(JsonDecoder),
80        Box::new(Z85Decoder),
81        Box::new(ReverseDecoder),
82        Box::new(CaesarDecoder),
83    ];
84    decoders.push(decoder);
85    let _ = DECODERS.set(decoders);
86}
87
88pub fn decode_chunk(
89    chunk: &Chunk,
90    max_depth: usize,
91    validate: bool,
92    deadline: Option<std::time::Instant>,
93    screen: Option<&crate::alphabet_filter::AlphabetScreen>,
94) -> Vec<Chunk> {
95    // NOTE: a blanket `has_decodable_payload` early-out was tried here
96    // (AUD-speed-2) and reverted: that predicate only recognises base64/hex
97    // alphabet runs, but the pipeline also runs URL/percent, HTML-entity,
98    // hex/octal/unicode-escape, MIME-word, quoted-printable and JSON decoders
99    // whose triggers it does not cover. Gating the whole fan-out on it silently
100    // dropped ~7% of credentials under structured-format wrapping
101    // (`every_contract_positive_fires_under_every_format_wrapper`). A correct
102    // superset gate fires on `% & \ " { =` — which saturate real source — so it
103    // buys almost nothing; the genuine cost (Caesar's 25× fan-out over the full
104    // chunk) belongs gated at the Caesar decoder on its own alphabetic-run
105    // precondition, not as a pipeline-wide recall hazard.
106    let mut decoded_chunks = Vec::new();
107    let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
108    // Use hash of data instead of full string to save memory on large files.
109    let mut seen = HashSet::from([hash_fast(chunk.data.as_bytes())]);
110    let mut total_bytes = 0usize;
111    // Count EVERY unique decoded chunk against the per-root fan-out cap,
112    // not just the ones that pass the alphabet screen and get returned
113    // (M2). Screen-failing chunks were still queued and recursively
114    // re-decoded but never incremented `decoded_chunks.len()`, so on the
115    // live screen-enabled path the 1000-chunk DoS guard never bound a
116    // high-fan-out decoder (Caesar emits up to 25 variants/candidate,
117    // most failing the screen). The screen decides whether a chunk is
118    // RETURNED for scanning; this counter decides the recursion budget.
119    let mut produced = 0usize;
120
121    let registry = get_decoders();
122
123    // Per-chunk wall-clock ceiling. Always apply the TIGHTER of the
124    // caller-supplied `deadline` and our own `DEFAULT_DECODE_WALL_BUDGET_MS`
125    // ceiling. kimi-wave1 audit finding 5.2: previously the caller's
126    // (long) scan deadline overrode this guard, letting a decode-bomb
127    // chunk consume the entire scan budget.
128    let local_ceiling =
129        std::time::Instant::now() + std::time::Duration::from_millis(DEFAULT_DECODE_WALL_BUDGET_MS);
130    let effective_deadline = match deadline {
131        Some(d) => d.min(local_ceiling),
132        None => local_ceiling,
133    };
134
135    while let Some((current, depth)) = queue.pop_front() {
136        if std::time::Instant::now() > effective_deadline {
137            tracing::debug!(
138                path = ?chunk.metadata.path,
139                budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
140                "decode budget exhausted; stopping decode-through"
141            );
142            break;
143        }
144        if depth >= max_depth {
145            continue;
146        }
147
148        for decoder in registry.iter() {
149            // Re-check the wall-clock budget BEFORE each decoder's
150            // candidate fan-out (C9). The top-of-loop check only fires
151            // once per BFS dequeue, so a single chunk could run all 14
152            // decoders to completion with no budget check, blowing far past
153            // DEFAULT_DECODE_WALL_BUDGET_MS on one chunk. This check stops us
154            // from even invoking the next decoder once the deadline trips;
155            // the matching check inside the inner loop below stops us
156            // consuming the CURRENT decoder's (un-bounded) output.
157            if std::time::Instant::now() > effective_deadline {
158                tracing::debug!(
159                    path = ?chunk.metadata.path,
160                    budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
161                    "decode budget exhausted mid-fan-out; stopping decode-through"
162                );
163                return decoded_chunks;
164            }
165            for decoded in decoder.decode_chunk(&current) {
166                // Re-check the budget WHILE consuming this decoder's output
167                // (C9 root cause). The pre-decoder check above only fires
168                // once per decoder, but `decode_chunk` returns a fully
169                // materialized Vec whose length is O(chunk size) -
170                // `extract_encoded_values` yields one candidate per quoted
171                // string / `key=value` / base64 run, and Caesar fans each out
172                // 25x. Without this check the pipeline still hashes, screens,
173                // clones, and queues every one of those results AFTER the
174                // deadline has passed, so a single dense chunk's fan-out
175                // (tens of thousands of results) ran the per-result work to
176                // completion regardless of the wall budget. The
177                // `decoder.decode_chunk` call itself cannot be interrupted
178                // (trait returns an owned Vec), but bailing here bounds the
179                // post-deadline overrun to one decoder's fan-out at most -
180                // and stops the (dominant) per-result processing cost dead.
181                if std::time::Instant::now() > effective_deadline {
182                    tracing::debug!(
183                        path = ?chunk.metadata.path,
184                        budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
185                        "decode budget exhausted while consuming decoder output; \
186                         stopping decode-through"
187                    );
188                    return decoded_chunks;
189                }
190                if seen.insert(hash_fast(decoded.data.as_bytes())) {
191                    // Optional sanitization (kimi-wave1 audit finding 5.1).
192                    // When `validate=true`, drop decoded chunks containing
193                    // NUL bytes - these are typically buggy-decoder output
194                    // (mis-decoded binary, broken-encoded base64) and feed
195                    // garbage into downstream regex scanning. C1 controls
196                    // (0x80-0x9F) are kept because legitimate UTF-8 multi-
197                    // byte sequences include those bytes.
198                    if validate && decoded.data.as_bytes().contains(&0u8) {
199                        continue;
200                    }
201                    let passes_screen = if let Some(screen) = screen {
202                        screen.screen(decoded.data.as_bytes())
203                    } else {
204                        true
205                    };
206
207                    // Count this unique decoded chunk against the fan-out
208                    // budget REGARDLESS of screen result (M2): a chunk that
209                    // fails the screen is still queued and recursively
210                    // re-decoded, so it must consume the recursion budget.
211                    produced += 1;
212                    total_bytes += decoded.data.len();
213                    if produced > MAX_DECODED_CHUNKS_PER_ROOT
214                        || total_bytes > MAX_DECODED_TOTAL_BYTES
215                    {
216                        // Demoted from `warn!` - hitting the recursive
217                        // decode limit is a benign cap, not an error.
218                        // Files with dense nested encoding (audit logs,
219                        // sealed blobs, base64-of-base64-of-zlib...)
220                        // trip it routinely on every scan, which made
221                        // routine output (e.g. `keyhog scan ~/.config`)
222                        // look like the scanner was failing. Real
223                        // scanner failures use `warn!`/`error!`.
224                        tracing::debug!(
225                            path = ?chunk.metadata.path,
226                            "decode depth/size cap reached: chunk truncated to limit"
227                        );
228                        return decoded_chunks;
229                    }
230
231                    queue.push_back((decoded.clone(), depth + 1));
232                    if passes_screen {
233                        decoded_chunks.push(decoded);
234                    }
235                }
236            }
237        }
238    }
239    decoded_chunks
240}
241
242pub(super) fn push_decoded_text_chunk(
243    decoded_chunks: &mut Vec<Chunk>,
244    chunk: &Chunk,
245    text: String,
246    decoder_name: &str,
247) {
248    // Legacy entrypoint with no source-blob info. Forwards to the
249    // splice-aware variant with `original_encoded = ""`, which falls
250    // back to the old "decoded text alone" chunk shape. New decoders
251    // should call `push_decoded_text_chunk_spliced` so the parent's
252    // companion context lands adjacent to the decoded credential.
253    push_decoded_text_chunk_spliced(decoded_chunks, chunk, "", text, decoder_name);
254}
255
256/// Push a decoded chunk that **splices** the decoded text back into
257/// the parent at the position of the original encoded blob. This
258/// keeps the parent's companion context (the `aws_secret =` /
259/// `Authorization: Bearer` / `api_key:` anchors) adjacent to the
260/// decoded credential, which is what detector regexes need to fire.
261///
262/// Pass an empty `original_encoded` to fall back to the legacy
263/// "decoded text alone" behavior.
264///
265/// Why this exists
266/// ---------------
267/// Before the splice path, `push_decoded_text_chunk` always emitted
268/// the decoded bytes in a brand-new chunk with NO surrounding text.
269/// The `encoding_explosion_runner` (tests/encoding_explosion_runner.rs)
270/// surfaced the resulting recall gap: base64/hex/url-percent
271/// encodings recovered only ~30% of contract credentials because
272/// every companion-anchored detector lost its anchor when the chunk
273/// was reduced to a bare decoded string. Splicing preserves the
274/// anchor and is the single biggest decode-through recall lever.
275pub(super) fn push_decoded_text_chunk_spliced(
276    decoded_chunks: &mut Vec<Chunk>,
277    chunk: &Chunk,
278    original_encoded: &str,
279    text: String,
280    decoder_name: &str,
281) {
282    // Fast ASCII check: control chars are always in 0x00-0x1F range.
283    // Byte-level iteration avoids UTF-8 decode overhead.
284    let bytes = text.as_bytes();
285    if text.is_empty()
286        || bytes
287            .iter()
288            .any(|&b| b < 0x20 && b != b'\n' && b != b'\r' && b != b'\t')
289    {
290        return;
291    }
292
293    // Build the new chunk's payload. Default: just the decoded text
294    // (legacy shape). If we know the original encoded blob AND it
295    // appears in the parent, splice the decoded text in at the first
296    // occurrence so the companion context survives. Cap the splice
297    // path on chunk size so a multi-MB parent doesn't blow memory.
298    const MAX_SPLICE_PARENT_BYTES: usize = 256 * 1024;
299    let (base_offset, payload) = if !original_encoded.is_empty()
300        && chunk.data.len() <= MAX_SPLICE_PARENT_BYTES
301    {
302        match splice_decoded_payload(chunk.data.as_str(), original_encoded, &text, decoder_name) {
303            // The decoded credential now sits `win_start` bytes into the
304            // windowed payload's parent slice, so shift base_offset to keep
305            // the reported file offset anchored to the real position.
306            Some((win_start, spliced)) => (
307                chunk.metadata.base_offset.saturating_add(win_start),
308                spliced,
309            ),
310            None => (chunk.metadata.base_offset, text),
311        }
312    } else {
313        (chunk.metadata.base_offset, text)
314    };
315
316    decoded_chunks.push(Chunk {
317        data: payload.into(),
318        metadata: ChunkMetadata {
319            // Defect #80 (root cause D): decoded-chunk findings used to
320            // report `offset: 0` regardless of where the encoded blob
321            // sat in the parent file - a Z85-decoded credential at
322            // offset 166332 of a 156955-byte file is meaningless to
323            // anyone trying to navigate to it. Inherit the parent's
324            // `base_offset` so the reported file offset is at least
325            // anchored to the parent window/file, not the decoded
326            // synthetic stream. Per-blob precision (offset OF the
327            // encoded blob in parent) would need `extract_encoded_values`
328            // to return positions too - a follow-up. This is strictly
329            // closer to the truth. When splicing succeeds we additionally
330            // shift by the context-window start so the offset points near the
331            // blob's real position, not just the parent's origin.
332            base_offset,
333            // Inherit the parent window's base line so a line reported on a
334            // decoded chunk from a >window_size file stays anchored to the
335            // parent window, exactly as base_offset is inherited above. 0 for
336            // non-windowed parents.
337            base_line: chunk.metadata.base_line,
338            source_type: format!("{}/{}", chunk.metadata.source_type, decoder_name),
339            path: chunk.metadata.path.clone(),
340            commit: chunk.metadata.commit.clone(),
341            author: chunk.metadata.author.clone(),
342            date: chunk.metadata.date.clone(),
343            // Decoded chunks inherit the parent's metadata; mtime/size
344            // are deliberately copied so the orchestrator's cache key
345            // tracks the underlying file even after a decode pass.
346            mtime_ns: chunk.metadata.mtime_ns,
347            size_bytes: chunk.metadata.size_bytes,
348        },
349    });
350}
351
/// Bytes of surrounding parent text kept on each side of the spliced-in
/// decoded credential. The splice exists ONLY to keep the decoded value's
/// companion anchor (assignment key / `Authorization:` header / `api_key=`
/// prefix) adjacent so companion-anchored detectors still fire. That anchor
/// always sits within a line or two of the credential, so a few hundred bytes
/// of context on each side is plenty.
///
/// `splice_decoded_payload` snaps both window edges outward to the nearest
/// UTF-8 char boundary, so the kept context can exceed this value by a few
/// bytes per side.
///
/// Why this is bounded (perf, not cosmetics): the previous implementation
/// spliced the decoded text into a copy of the ENTIRE parent, producing one
/// parent-sized decoded chunk PER candidate. On a 156 KB source file with
/// ~1800 splice candidates (every quoted string / `key=value` / hex/base64
/// run) that spawned ~280 MB of decoded chunks - each then rescanned by the
/// full engine and recursively re-decoded - an O(candidates × file_size)
/// blowup that pinned a single b43/main.c scan at ~15s. Windowing makes each
/// spliced chunk O(window), turning the whole pass linear. Recall is
/// unaffected because no detector reaches across hundreds of bytes for its
/// anchor.
const SPLICE_CONTEXT_WINDOW: usize = 512;
370
/// Returns the largest UTF-8 char boundary in `s` that is `<= idx`; any
/// `idx` at or past the end of `s` yields `s.len()`.
///
/// Stable-Rust stand-in for the unstable `str::floor_char_boundary`, used to
/// snap the start of the splice context window so it never cuts through a
/// multi-byte codepoint.
fn floor_char_boundary(s: &str, mut idx: usize) -> usize {
    idx = idx.min(s.len());
    // Offsets 0 and `s.len()` are always boundaries, so the search cannot fail.
    (0..=idx)
        .rev()
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(0)
}
383
/// Returns the smallest UTF-8 char boundary in `s` that is `>= idx`; any
/// `idx` at or past the end of `s` yields `s.len()`.
///
/// Counterpart of `floor_char_boundary`, used to snap the end of the splice
/// context window so it never cuts through a multi-byte codepoint.
fn ceil_char_boundary(s: &str, mut idx: usize) -> usize {
    idx = idx.min(s.len());
    // `s.len()` is always a boundary, so the search cannot fail.
    (idx..=s.len())
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(s.len())
}
393
394/// Returns `(window_start, payload)` where `window_start` is the byte offset
395/// in `parent` at which `payload` begins, so the caller can keep the reported
396/// finding offset anchored to the real file position.
397fn splice_decoded_payload(
398    parent: &str,
399    original_encoded: &str,
400    decoded_text: &str,
401    decoder_name: &str,
402) -> Option<(usize, String)> {
403    let start = parent.find(original_encoded)?;
404    let mut end = start + original_encoded.len();
405
406    if decoder_name == "base64" {
407        end = consume_adjacent_base64_padding(parent.as_bytes(), end);
408    }
409
410    // Keep only a bounded window of parent context around the encoded blob.
411    let win_start = floor_char_boundary(parent, start.saturating_sub(SPLICE_CONTEXT_WINDOW));
412    let win_end = ceil_char_boundary(parent, end.saturating_add(SPLICE_CONTEXT_WINDOW));
413
414    let mut payload =
415        String::with_capacity((win_end - win_start) - (end - start) + decoded_text.len());
416    payload.push_str(&parent[win_start..start]);
417    payload.push_str(decoded_text);
418    payload.push_str(&parent[end..win_end]);
419    Some((win_start, payload))
420}
421
/// Extends a base64 blob's end offset over the `=` padding that follows it.
///
/// Starting at byte offset `start` in `parent`, accepts at most two `=`
/// bytes. The extended offset is returned only when the padding is followed
/// by the end of input or by one of the delimiter bytes newline, carriage
/// return, tab, space, `;`, `,`, `"`, `'` or a backtick. In every other case
/// (no padding, or padding followed by anything else) `start` is returned
/// unchanged.
fn consume_adjacent_base64_padding(parent: &[u8], start: usize) -> usize {
    let tail = parent.get(start..).unwrap_or(&[]);
    let pad_len = tail.iter().take(2).take_while(|&&b| b == b'=').count();
    if pad_len == 0 {
        return start;
    }

    let end = start + pad_len;
    let delimited = match parent.get(end) {
        None => true,
        Some(&next) => matches!(
            next,
            b'\n' | b'\r' | b'\t' | b' ' | b';' | b',' | b'"' | b'\'' | b'`'
        ),
    };
    if delimited {
        end
    } else {
        start
    }
}
435
436pub(super) fn decode_candidates<F>(
437    chunk: &Chunk,
438    candidates: Vec<String>,
439    mut decode: F,
440    decoder_name: &str,
441) -> Vec<Chunk>
442where
443    F: FnMut(&str) -> Result<String, ()>,
444{
445    let mut decoded_chunks = Vec::new();
446    for candidate in candidates {
447        if let Ok(text) = decode(&candidate) {
448            // Splice each decoded value back over its original
449            // candidate string in the parent - keeps companion
450            // context (assignment keys, format-specific anchors)
451            // adjacent to the decoded credential. Same recall-gap
452            // fix as base64/hex/json.
453            push_decoded_text_chunk_spliced(
454                &mut decoded_chunks,
455                chunk,
456                &candidate,
457                text,
458                decoder_name,
459            );
460        }
461    }
462    decoded_chunks
463}
464
465mod extractor;
466pub(super) use extractor::{extract_encoded_values, hash_fast};
keyhog_scanner/decode/pipeline.rs

keyhog_scanner/decode/
pipeline.rs