keyhog_scanner/decode/pipeline.rs
1use super::base64::{Base64Decoder, Z85Decoder};
2use super::caesar::CaesarDecoder;
3use super::hex::HexDecoder;
4use super::json::JsonDecoder;
5use super::reverse::ReverseDecoder;
6use super::url::{
7 HexEscapeDecoder, HtmlNamedEntityDecoder, HtmlNumericEntityDecoder, MimeEncodedWordDecoder,
8 OctalEscapeDecoder, QuotedPrintableDecoder, UnicodeEscapeDecoder, UrlDecoder,
9};
10use super::Decoder;
11use keyhog_core::{Chunk, ChunkMetadata};
12use std::collections::{HashSet, VecDeque};
13
14static DECODERS: std::sync::OnceLock<Vec<Box<dyn Decoder>>> = std::sync::OnceLock::new();
15
/// Cap on the number of unique decoded chunks that one root chunk may
/// produce inside `decode_chunk`; decode-through stops once it is exceeded.
const MAX_DECODED_CHUNKS_PER_ROOT: usize = 1000;
/// Cap on the summed byte length of every decoded chunk produced from one
/// root chunk (64 MiB); decode-through stops once it is exceeded.
const MAX_DECODED_TOTAL_BYTES: usize = 64 * 1024 * 1024;
/// Hard ceiling, in milliseconds, on the wall-clock time `decode_chunk` may
/// spend on ONE chunk. `decode_chunk` always applies the tighter of this
/// ceiling and any caller-supplied deadline. It guards against decode-bomb
/// inputs (multi-layer base64 of unrelated data) where every layer fits
/// under `MAX_DECODED_TOTAL_BYTES` yet the layers together blow the time
/// budget. Tuned generously: 50 ms is ~10x the cost of a normal chunk's full
/// decode-through, so only pathological inputs reach it - and they reach it
/// before the user notices.
const DEFAULT_DECODE_WALL_BUDGET_MS: u64 = 50;
26
27fn get_decoders() -> &'static [Box<dyn Decoder>] {
28 DECODERS.get_or_init(|| {
29 vec![
30 Box::new(Base64Decoder),
31 Box::new(HexDecoder),
32 Box::new(UrlDecoder),
33 Box::new(QuotedPrintableDecoder),
34 Box::new(HtmlNamedEntityDecoder),
35 Box::new(HtmlNumericEntityDecoder),
36 Box::new(HexEscapeDecoder),
37 Box::new(OctalEscapeDecoder),
38 Box::new(MimeEncodedWordDecoder),
39 Box::new(UnicodeEscapeDecoder),
40 // JSON unescape - strips `\"` / `\\` / `\n` style escapes
41 // inside JSON string values so credentials stored as
42 // JSON-encoded fields (the most common shape after .env)
43 // survive into the scanner. Originally implemented but
44 // never registered - the adversarial_explosion_runner's
45 // `json` wrapper class surfaced ~73 misses that wiring
46 // this in closed (5792/5792 variants now fire).
47 Box::new(JsonDecoder),
48 Box::new(Z85Decoder),
49 Box::new(ReverseDecoder),
50 Box::new(CaesarDecoder),
51 ]
52 })
53}
54
55/// Register a custom decoder. Must be called BEFORE any scan runs.
56/// Panics if the decoder list has already been initialized.
57pub fn register_decoder(decoder: Box<dyn Decoder>) {
58 // After initialization, the decoder list is immutable for lock-free reads.
59 // Custom decoders must be registered before the first scan.
60 if DECODERS.get().is_some() {
61 tracing::warn!("register_decoder called after initialization: decoder ignored. Fix: register custom decoders before scanning.");
62 return;
63 }
64 // KEEP THIS LIST IN SYNC with `get_decoders()` above - they're
65 // two paths to the same initialized state, and a decoder missing
66 // here would silently vanish from any custom-decoder-registered
67 // run.
68 let mut decoders: Vec<Box<dyn Decoder>> = vec![
69 Box::new(Base64Decoder),
70 Box::new(HexDecoder),
71 Box::new(UrlDecoder),
72 Box::new(QuotedPrintableDecoder),
73 Box::new(HtmlNamedEntityDecoder),
74 Box::new(HtmlNumericEntityDecoder),
75 Box::new(HexEscapeDecoder),
76 Box::new(OctalEscapeDecoder),
77 Box::new(MimeEncodedWordDecoder),
78 Box::new(UnicodeEscapeDecoder),
79 Box::new(JsonDecoder),
80 Box::new(Z85Decoder),
81 Box::new(ReverseDecoder),
82 Box::new(CaesarDecoder),
83 ];
84 decoders.push(decoder);
85 let _ = DECODERS.set(decoders);
86}
87
88pub fn decode_chunk(
89 chunk: &Chunk,
90 max_depth: usize,
91 validate: bool,
92 deadline: Option<std::time::Instant>,
93 screen: Option<&crate::alphabet_filter::AlphabetScreen>,
94) -> Vec<Chunk> {
95 // NOTE: a blanket `has_decodable_payload` early-out was tried here
96 // (AUD-speed-2) and reverted: that predicate only recognises base64/hex
97 // alphabet runs, but the pipeline also runs URL/percent, HTML-entity,
98 // hex/octal/unicode-escape, MIME-word, quoted-printable and JSON decoders
99 // whose triggers it does not cover. Gating the whole fan-out on it silently
100 // dropped ~7% of credentials under structured-format wrapping
101 // (`every_contract_positive_fires_under_every_format_wrapper`). A correct
102 // superset gate fires on `% & \ " { =` — which saturate real source — so it
103 // buys almost nothing; the genuine cost (Caesar's 25× fan-out over the full
104 // chunk) belongs gated at the Caesar decoder on its own alphabetic-run
105 // precondition, not as a pipeline-wide recall hazard.
106 let mut decoded_chunks = Vec::new();
107 let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
108 // Use hash of data instead of full string to save memory on large files.
109 let mut seen = HashSet::from([hash_fast(chunk.data.as_bytes())]);
110 let mut total_bytes = 0usize;
111 // Count EVERY unique decoded chunk against the per-root fan-out cap,
112 // not just the ones that pass the alphabet screen and get returned
113 // (M2). Screen-failing chunks were still queued and recursively
114 // re-decoded but never incremented `decoded_chunks.len()`, so on the
115 // live screen-enabled path the 1000-chunk DoS guard never bound a
116 // high-fan-out decoder (Caesar emits up to 25 variants/candidate,
117 // most failing the screen). The screen decides whether a chunk is
118 // RETURNED for scanning; this counter decides the recursion budget.
119 let mut produced = 0usize;
120
121 let registry = get_decoders();
122
123 // Per-chunk wall-clock ceiling. Always apply the TIGHTER of the
124 // caller-supplied `deadline` and our own `DEFAULT_DECODE_WALL_BUDGET_MS`
125 // ceiling. kimi-wave1 audit finding 5.2: previously the caller's
126 // (long) scan deadline overrode this guard, letting a decode-bomb
127 // chunk consume the entire scan budget.
128 let local_ceiling =
129 std::time::Instant::now() + std::time::Duration::from_millis(DEFAULT_DECODE_WALL_BUDGET_MS);
130 let effective_deadline = match deadline {
131 Some(d) => d.min(local_ceiling),
132 None => local_ceiling,
133 };
134
135 while let Some((current, depth)) = queue.pop_front() {
136 if std::time::Instant::now() > effective_deadline {
137 tracing::debug!(
138 path = ?chunk.metadata.path,
139 budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
140 "decode budget exhausted; stopping decode-through"
141 );
142 break;
143 }
144 if depth >= max_depth {
145 continue;
146 }
147
148 for decoder in registry.iter() {
149 // Re-check the wall-clock budget BEFORE each decoder's
150 // candidate fan-out (C9). The top-of-loop check only fires
151 // once per BFS dequeue, so a single chunk could run all 14
152 // decoders to completion with no budget check, blowing far past
153 // DEFAULT_DECODE_WALL_BUDGET_MS on one chunk. This check stops us
154 // from even invoking the next decoder once the deadline trips;
155 // the matching check inside the inner loop below stops us
156 // consuming the CURRENT decoder's (un-bounded) output.
157 if std::time::Instant::now() > effective_deadline {
158 tracing::debug!(
159 path = ?chunk.metadata.path,
160 budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
161 "decode budget exhausted mid-fan-out; stopping decode-through"
162 );
163 return decoded_chunks;
164 }
165 for decoded in decoder.decode_chunk(¤t) {
166 // Re-check the budget WHILE consuming this decoder's output
167 // (C9 root cause). The pre-decoder check above only fires
168 // once per decoder, but `decode_chunk` returns a fully
169 // materialized Vec whose length is O(chunk size) -
170 // `extract_encoded_values` yields one candidate per quoted
171 // string / `key=value` / base64 run, and Caesar fans each out
172 // 25x. Without this check the pipeline still hashes, screens,
173 // clones, and queues every one of those results AFTER the
174 // deadline has passed, so a single dense chunk's fan-out
175 // (tens of thousands of results) ran the per-result work to
176 // completion regardless of the wall budget. The
177 // `decoder.decode_chunk` call itself cannot be interrupted
178 // (trait returns an owned Vec), but bailing here bounds the
179 // post-deadline overrun to one decoder's fan-out at most -
180 // and stops the (dominant) per-result processing cost dead.
181 if std::time::Instant::now() > effective_deadline {
182 tracing::debug!(
183 path = ?chunk.metadata.path,
184 budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
185 "decode budget exhausted while consuming decoder output; \
186 stopping decode-through"
187 );
188 return decoded_chunks;
189 }
190 if seen.insert(hash_fast(decoded.data.as_bytes())) {
191 // Optional sanitization (kimi-wave1 audit finding 5.1).
192 // When `validate=true`, drop decoded chunks containing
193 // NUL bytes - these are typically buggy-decoder output
194 // (mis-decoded binary, broken-encoded base64) and feed
195 // garbage into downstream regex scanning. C1 controls
196 // (0x80-0x9F) are kept because legitimate UTF-8 multi-
197 // byte sequences include those bytes.
198 if validate && decoded.data.as_bytes().contains(&0u8) {
199 continue;
200 }
201 let passes_screen = if let Some(screen) = screen {
202 screen.screen(decoded.data.as_bytes())
203 } else {
204 true
205 };
206
207 // Count this unique decoded chunk against the fan-out
208 // budget REGARDLESS of screen result (M2): a chunk that
209 // fails the screen is still queued and recursively
210 // re-decoded, so it must consume the recursion budget.
211 produced += 1;
212 total_bytes += decoded.data.len();
213 if produced > MAX_DECODED_CHUNKS_PER_ROOT
214 || total_bytes > MAX_DECODED_TOTAL_BYTES
215 {
216 // Demoted from `warn!` - hitting the recursive
217 // decode limit is a benign cap, not an error.
218 // Files with dense nested encoding (audit logs,
219 // sealed blobs, base64-of-base64-of-zlib...)
220 // trip it routinely on every scan, which made
221 // routine output (e.g. `keyhog scan ~/.config`)
222 // look like the scanner was failing. Real
223 // scanner failures use `warn!`/`error!`.
224 tracing::debug!(
225 path = ?chunk.metadata.path,
226 "decode depth/size cap reached: chunk truncated to limit"
227 );
228 return decoded_chunks;
229 }
230
231 queue.push_back((decoded.clone(), depth + 1));
232 if passes_screen {
233 decoded_chunks.push(decoded);
234 }
235 }
236 }
237 }
238 }
239 decoded_chunks
240}
241
242pub(super) fn push_decoded_text_chunk(
243 decoded_chunks: &mut Vec<Chunk>,
244 chunk: &Chunk,
245 text: String,
246 decoder_name: &str,
247) {
248 // Legacy entrypoint with no source-blob info. Forwards to the
249 // splice-aware variant with `original_encoded = ""`, which falls
250 // back to the old "decoded text alone" chunk shape. New decoders
251 // should call `push_decoded_text_chunk_spliced` so the parent's
252 // companion context lands adjacent to the decoded credential.
253 push_decoded_text_chunk_spliced(decoded_chunks, chunk, "", text, decoder_name);
254}
255
256/// Push a decoded chunk that **splices** the decoded text back into
257/// the parent at the position of the original encoded blob. This
258/// keeps the parent's companion context (the `aws_secret =` /
259/// `Authorization: Bearer` / `api_key:` anchors) adjacent to the
260/// decoded credential, which is what detector regexes need to fire.
261///
262/// Pass an empty `original_encoded` to fall back to the legacy
263/// "decoded text alone" behavior.
264///
265/// Why this exists
266/// ---------------
267/// Before the splice path, `push_decoded_text_chunk` always emitted
268/// the decoded bytes in a brand-new chunk with NO surrounding text.
269/// The `encoding_explosion_runner` (tests/encoding_explosion_runner.rs)
270/// surfaced the resulting recall gap: base64/hex/url-percent
271/// encodings recovered only ~30% of contract credentials because
272/// every companion-anchored detector lost its anchor when the chunk
273/// was reduced to a bare decoded string. Splicing preserves the
274/// anchor and is the single biggest decode-through recall lever.
275pub(super) fn push_decoded_text_chunk_spliced(
276 decoded_chunks: &mut Vec<Chunk>,
277 chunk: &Chunk,
278 original_encoded: &str,
279 text: String,
280 decoder_name: &str,
281) {
282 // Fast ASCII check: control chars are always in 0x00-0x1F range.
283 // Byte-level iteration avoids UTF-8 decode overhead.
284 let bytes = text.as_bytes();
285 if text.is_empty()
286 || bytes
287 .iter()
288 .any(|&b| b < 0x20 && b != b'\n' && b != b'\r' && b != b'\t')
289 {
290 return;
291 }
292
293 // Build the new chunk's payload. Default: just the decoded text
294 // (legacy shape). If we know the original encoded blob AND it
295 // appears in the parent, splice the decoded text in at the first
296 // occurrence so the companion context survives. Cap the splice
297 // path on chunk size so a multi-MB parent doesn't blow memory.
298 const MAX_SPLICE_PARENT_BYTES: usize = 256 * 1024;
299 let (base_offset, payload) = if !original_encoded.is_empty()
300 && chunk.data.len() <= MAX_SPLICE_PARENT_BYTES
301 {
302 match splice_decoded_payload(chunk.data.as_str(), original_encoded, &text, decoder_name) {
303 // The decoded credential now sits `win_start` bytes into the
304 // windowed payload's parent slice, so shift base_offset to keep
305 // the reported file offset anchored to the real position.
306 Some((win_start, spliced)) => (
307 chunk.metadata.base_offset.saturating_add(win_start),
308 spliced,
309 ),
310 None => (chunk.metadata.base_offset, text),
311 }
312 } else {
313 (chunk.metadata.base_offset, text)
314 };
315
316 decoded_chunks.push(Chunk {
317 data: payload.into(),
318 metadata: ChunkMetadata {
319 // Defect #80 (root cause D): decoded-chunk findings used to
320 // report `offset: 0` regardless of where the encoded blob
321 // sat in the parent file - a Z85-decoded credential at
322 // offset 166332 of a 156955-byte file is meaningless to
323 // anyone trying to navigate to it. Inherit the parent's
324 // `base_offset` so the reported file offset is at least
325 // anchored to the parent window/file, not the decoded
326 // synthetic stream. Per-blob precision (offset OF the
327 // encoded blob in parent) would need `extract_encoded_values`
328 // to return positions too - a follow-up. This is strictly
329 // closer to the truth. When splicing succeeds we additionally
330 // shift by the context-window start so the offset points near the
331 // blob's real position, not just the parent's origin.
332 base_offset,
333 // Inherit the parent window's base line so a line reported on a
334 // decoded chunk from a >window_size file stays anchored to the
335 // parent window, exactly as base_offset is inherited above. 0 for
336 // non-windowed parents.
337 base_line: chunk.metadata.base_line,
338 source_type: format!("{}/{}", chunk.metadata.source_type, decoder_name),
339 path: chunk.metadata.path.clone(),
340 commit: chunk.metadata.commit.clone(),
341 author: chunk.metadata.author.clone(),
342 date: chunk.metadata.date.clone(),
343 // Decoded chunks inherit the parent's metadata; mtime/size
344 // are deliberately copied so the orchestrator's cache key
345 // tracks the underlying file even after a decode pass.
346 mtime_ns: chunk.metadata.mtime_ns,
347 size_bytes: chunk.metadata.size_bytes,
348 },
349 });
350}
351
/// Number of bytes of parent text retained on EACH side of the decoded
/// credential when it is spliced back into its parent.
///
/// The splice exists only to keep the decoded value's companion anchor
/// (assignment key / `Authorization:` header / `api_key=` prefix) next to it
/// so companion-anchored detectors still fire. That anchor sits within a line
/// or two of the credential, so a few hundred bytes per side is plenty.
///
/// The bound is a performance requirement, not a cosmetic one. The earlier
/// implementation spliced into a copy of the ENTIRE parent, i.e. one
/// parent-sized decoded chunk PER candidate. On a 156 KB source file with
/// ~1800 splice candidates (every quoted string / `key=value` / hex/base64
/// run) that produced ~280 MB of decoded chunks - each rescanned by the full
/// engine and recursively re-decoded - an O(candidates × file_size) blowup
/// that pinned a single b43/main.c scan at ~15s. With a window each spliced
/// chunk is O(window) and the whole pass becomes linear. Recall is unaffected
/// because no detector reaches across hundreds of bytes for its anchor.
const SPLICE_CONTEXT_WINDOW: usize = 512;
370
/// Largest UTF-8 char boundary in `s` that is `<= idx` (stable-Rust stand-in
/// for the unstable `str::floor_char_boundary`). An `idx` at or beyond the
/// end of `s` yields `s.len()`. Used to snap the splice context window so it
/// never slices a multi-byte codepoint.
fn floor_char_boundary(s: &str, idx: usize) -> usize {
    let capped = idx.min(s.len());
    // Offset 0 is always a boundary, so the search cannot come up empty; the
    // fallback only exists to avoid an unwrap.
    (0..=capped)
        .rev()
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(0)
}
383
/// Smallest UTF-8 char boundary in `s` that is `>= idx`. An `idx` at or
/// beyond the end of `s` yields `s.len()`. Counterpart of
/// `floor_char_boundary`, used to snap the end of the splice context window.
fn ceil_char_boundary(s: &str, idx: usize) -> usize {
    // When no boundary is found before the end (including the empty range
    // for an out-of-range `idx`), the end of the string is the answer.
    (idx..s.len())
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(s.len())
}
393
394/// Returns `(window_start, payload)` where `window_start` is the byte offset
395/// in `parent` at which `payload` begins, so the caller can keep the reported
396/// finding offset anchored to the real file position.
397fn splice_decoded_payload(
398 parent: &str,
399 original_encoded: &str,
400 decoded_text: &str,
401 decoder_name: &str,
402) -> Option<(usize, String)> {
403 let start = parent.find(original_encoded)?;
404 let mut end = start + original_encoded.len();
405
406 if decoder_name == "base64" {
407 end = consume_adjacent_base64_padding(parent.as_bytes(), end);
408 }
409
410 // Keep only a bounded window of parent context around the encoded blob.
411 let win_start = floor_char_boundary(parent, start.saturating_sub(SPLICE_CONTEXT_WINDOW));
412 let win_end = ceil_char_boundary(parent, end.saturating_add(SPLICE_CONTEXT_WINDOW));
413
414 let mut payload =
415 String::with_capacity((win_end - win_start) - (end - start) + decoded_text.len());
416 payload.push_str(&parent[win_start..start]);
417 payload.push_str(decoded_text);
418 payload.push_str(&parent[end..win_end]);
419 Some((win_start, payload))
420}
421
/// Extend a base64 blob ending at byte offset `start` of `parent` over its
/// trailing `=` padding.
///
/// Up to two `=` bytes are accepted, and only when the padding is followed by
/// the end of input or by one of the delimiters newline, carriage return,
/// tab, space, `;`, `,`, `"`, `'` or `` ` ``. Returns the offset just past
/// the padding, or `start` unchanged when there is no padding or it is not
/// terminated that way.
fn consume_adjacent_base64_padding(parent: &[u8], start: usize) -> usize {
    let tail = parent.get(start..).unwrap_or(&[]);
    let pad_len = tail.iter().take(2).take_while(|&&b| b == b'=').count();
    if pad_len == 0 {
        return start;
    }

    let after_pad = start + pad_len;
    let properly_terminated = match parent.get(after_pad) {
        None => true,
        Some(&next) => matches!(
            next,
            b'\n' | b'\r' | b'\t' | b' ' | b';' | b',' | b'"' | b'\'' | b'`'
        ),
    };
    if properly_terminated {
        after_pad
    } else {
        start
    }
}
435
436pub(super) fn decode_candidates<F>(
437 chunk: &Chunk,
438 candidates: Vec<String>,
439 mut decode: F,
440 decoder_name: &str,
441) -> Vec<Chunk>
442where
443 F: FnMut(&str) -> Result<String, ()>,
444{
445 let mut decoded_chunks = Vec::new();
446 for candidate in candidates {
447 if let Ok(text) = decode(&candidate) {
448 // Splice each decoded value back over its original
449 // candidate string in the parent - keeps companion
450 // context (assignment keys, format-specific anchors)
451 // adjacent to the decoded credential. Same recall-gap
452 // fix as base64/hex/json.
453 push_decoded_text_chunk_spliced(
454 &mut decoded_chunks,
455 chunk,
456 &candidate,
457 text,
458 decoder_name,
459 );
460 }
461 }
462 decoded_chunks
463}
464
465mod extractor;
466pub(super) use extractor::{extract_encoded_values, hash_fast};