keyhog_scanner/decode/reverse.rs
1use super::pipeline::{decode_candidates, extract_encoded_values};
2use super::Decoder;
3use keyhog_core::Chunk;
4
/// Match secrets that have been reversed character-by-character to dodge a
/// naïve byte-substring scan. This is a cheap evasion that the adversarial
/// corpus (release-2026-04-26) hits multiple times - for example,
/// `RNK1ESEMURKWESFEDBA-46AIKA` is the AWS-access-key-id-shaped string
/// `AKIA64-ABDEFSEWKRUMESE1KNR` reversed.
///
/// The reverse decoder runs *after* the other decoders fail to match. It only
/// emits a decoded chunk when the candidate is at least `MIN_REVERSE_LEN`
/// (16) bytes long and passes `looks_reversible`; below that length,
/// reversed strings collide with normal text and produce too many useless
/// chunks for the scanner to dedup.
pub struct ReverseDecoder;
15
/// Minimum candidate length, in bytes (as reported by `len()`), that
/// `ReverseDecoder::decode_chunk` will consider for reverse decoding.
/// Shorter candidates are dropped before any other check runs.
const MIN_REVERSE_LEN: usize = 16;
17
18impl Decoder for ReverseDecoder {
19 fn name(&self) -> &'static str {
20 "reverse"
21 }
22
23 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
24 // Refuse to recurse on our own output: reverse(reverse(s)) == s, so
25 // the recursive pass would emit the original credential under a
26 // `…/reverse/reverse` source_type, defeating downstream
27 // evasion-aware suppression rules and (at minimum) wasting work.
28 if chunk.metadata.source_type.contains("/reverse") {
29 return Vec::new();
30 }
31 let candidates: Vec<String> = extract_encoded_values(&chunk.data)
32 .into_iter()
33 .filter(|c| c.len() >= MIN_REVERSE_LEN)
34 .filter(|c| looks_reversible(c))
35 .collect();
36 decode_candidates(chunk, candidates, |s| Ok(reverse_str(s)), self.name())
37 }
38}
39
/// Returns a new `String` holding the `char`s of `s` in reverse order.
///
/// Reversal is per Unicode scalar value (`char`), not per byte, so the
/// result is always valid UTF-8. Applying the function twice yields the
/// original string.
pub fn reverse_str(s: &str) -> String {
    // The reversed string has exactly the same UTF-8 byte length as the input.
    let mut flipped = String::with_capacity(s.len());
    for ch in s.chars().rev() {
        flipped.push(ch);
    }
    flipped
}
43
44/// Reverse-decode is asymmetric: every string trivially "decodes" to its
45/// reverse, so we'd emit O(N) decoy chunks for normal text. Two cheap gates:
46///
47/// 1. A 12+ ASCII alphanumeric run in the reversed direction (filters out
48/// `a-b-c-d-...` and other punctuated text).
49/// 2. The reversed text must contain at least one known credential prefix
50/// from `confidence::KNOWN_PREFIXES`. Without this, plain prose like
51/// `ABCDEFGHIJKLMNOPQRSTUVWXYZ` reverses to `ZYXWVUTSRQPONMLKJIHGFEDCBA`,
52/// passes the alphanumeric-run gate, and gets emitted as a decoy chunk
53/// on every chunk that contains a long alphanumeric word - pure noise
54/// that hammers the dedup layer. Kimi-decode audit finding #4.
55pub fn looks_reversible(candidate: &str) -> bool {
56 let bytes = candidate.as_bytes();
57 let mut run = 0usize;
58 let mut saw_long_run = false;
59 for &b in bytes.iter().rev() {
60 if b.is_ascii_alphanumeric() {
61 run += 1;
62 if run >= 12 {
63 saw_long_run = true;
64 break;
65 }
66 } else {
67 run = 0;
68 }
69 }
70 if !saw_long_run {
71 return false;
72 }
73 // Only emit a reverse-decoded chunk when the reversed string would
74 // contain a known provider prefix. Stops `ZYXWVUTSRQPONMLKJIHGFEDCBA`
75 // from looking like a candidate just because it has a long alnum run.
76 //
77 // Skip 2-char prefixes - the only entry that short is the Ethereum
78 // `0x` literal. `0x` shows up by random chance in ~1.6% of 80-char
79 // base64 strings, which routed every such reversed blob through the
80 // decoder and emitted spurious findings on the base64-protobuf
81 // decoy class. Investigator empirically attributed 4 FPs to this
82 // exact path. An Ethereum address embedded inside an obfuscated
83 // reversed string is exotic enough that the recall loss is near zero;
84 // every 3+ char vendor prefix (`hf_`, `SG.`, `eyJ`, `sk-`, `ghp_`,
85 // ...) still gates as before.
86 let reversed = reverse_str(candidate);
87 crate::confidence::KNOWN_PREFIXES
88 .iter()
89 .filter(|prefix| prefix.len() >= 3)
90 .any(|prefix| reversed.contains(prefix))
91}