Skip to main content

keyhog_scanner/decode/
caesar.rs

1use super::pipeline::{extract_encoded_values, push_decoded_text_chunk};
2use super::Decoder;
3use aho_corasick::AhoCorasick;
4use keyhog_core::Chunk;
5use std::sync::LazyLock;
6
/// Decoder for Caesar / ROT13 / ROT-N obfuscated text.
///
/// A handful of malware-config dumps and CTF fixtures store their tokens
/// rotated (`AKIA...` becomes `NXVN...` under ROT13). For every extracted
/// candidate of at least `MIN_CAESAR_LEN` bytes the decoder tries the 25
/// non-trivial shifts and emits only the variants that look like a
/// credential.
///
/// That shape test is what bounds the output: without it every candidate
/// would contribute 25 sibling chunks. A shifted variant is emitted only when
/// [`looks_credential_shaped`] accepts it, i.e. when it
///   1. contains at least one ASCII digit (most modern API key formats
///      include digits; pure-letter Caesar output rarely indicates a real
///      secret),
///   2. has a contiguous run of at least `MIN_ALNUM_RUN` ASCII alphanumeric
///      bytes (the AWS / GitHub / Slack token shape), and
///   3. contains one of `crate::confidence::KNOWN_PREFIXES`.
///
/// Source-code files are skipped entirely (see [`is_source_code_path`]).
/// Real secrets are never Caesar-encoded inside source, and the 25-shift
/// fan-out over every prose comment in a codebase only hallucinates detector
/// matches from random letter runs. The original reproducer was a
/// helicone-api-key hit on a `//! Source trait` doc comment; see
/// dogfood-2026-05-21.md finding #5.
pub struct CaesarDecoder;

/// Shortest candidate, in bytes, that `decode_chunk` will try to shift.
const MIN_CAESAR_LEN: usize = 16;
/// Length of the contiguous ASCII-alphanumeric run a credential-shaped string
/// must contain.
const MIN_ALNUM_RUN: usize = 8;
31
32/// Aho-Corasick over the "rotated known-prefix" needle set: for every
33/// [`crate::confidence::KNOWN_PREFIXES`] entry `P` and every non-trivial shift
34/// `k` in `1..=25`, the string `caesar_shift(P, 26 - k)` — i.e. `P` with its
35/// ASCII letters rotated BACKWARD by `k` (digits / punctuation fixed).
36///
37/// SOUNDNESS (recall-exact, not merely a superset). `caesar_shift(_, k)` is a
38/// position-wise bijection on a string, so for any candidate `c`:
39///   `caesar_shift(c, k).contains(P)`  ⟺  `c.contains(caesar_shift(P, 26 - k))`.
40/// Therefore "some shift in `1..=25` of `c` contains some known prefix" is
41/// EXACTLY "`c` contains some needle in this automaton". The final gate inside
42/// [`looks_credential_shaped`] is precisely that `KNOWN_PREFIXES` substring
43/// test, and its other two gates (≥1 digit, an 8+ alphanumeric run) are
44/// shift-invariant and checked once by [`candidate_shape_invariant`]. So a
45/// candidate that matches NO needle here can never produce a credential-shaped
46/// variant under any shift — its entire 25× `caesar_shift` fan-out + re-scan is
47/// provably dead work and is skipped with zero recall loss. This replaces the
48/// unsound "longest alphabetic run ≥ 16" gate (a `0x` / `SG.` / `hf_` prefix
49/// needs only a 1–2 letter run, so a credential-shaped shift can arise from a
50/// chunk with no long alphabetic run). See `perf_decode_caesar.rs`.
51static ROTATED_PREFIX_AC: LazyLock<Option<AhoCorasick>> = LazyLock::new(|| {
52    let mut needles: Vec<String> = Vec::new();
53    for prefix in crate::confidence::KNOWN_PREFIXES {
54        for k in 1..=25u8 {
55            // rot_{-k}(P) == caesar_shift(P, 26 - k); k in 1..=25 => 26-k in 1..=25.
56            needles.push(caesar_shift(prefix, 26 - k));
57        }
58    }
59    AhoCorasick::new(&needles).ok()
60});
61
/// Lower-case file extensions for which Caesar decoding is pure noise.
///
/// Compared against the tail of the normalised, lower-cased path. The list is
/// deliberately short: only the dominant source-code extensions a scanner is
/// realistically pointed at.
const SOURCE_CODE_EXTENSIONS: &[&str] = &[
    ".rs", ".py", ".go", ".js", ".jsx", ".ts", ".tsx", ".java", ".kt", ".scala", ".c", ".cc",
    ".cpp", ".cxx", ".h", ".hh", ".hpp", ".cs", ".rb", ".php", ".swift", ".m", ".mm", ".sh",
    ".bash", ".zsh", ".fish", ".lua", ".pl", ".pm", ".sql", ".html", ".htm", ".css", ".scss",
    ".sass", ".vue", ".svelte", ".md", ".rst", ".txt", ".adoc", ".tbl", ".mk", ".cmake",
];

/// Lower-case file names (final path component) that count as source code
/// even though they carry no listed extension.
const SOURCE_CODE_FILENAMES: &[&str] = &["kconfig", "makefile", "cmakelists.txt"];

/// Reports whether `path` names a source-code file.
///
/// Backslashes are treated as path separators and the comparison ignores
/// ASCII case. A path qualifies when its final component is one of
/// `SOURCE_CODE_FILENAMES` or when the whole path ends with one of
/// `SOURCE_CODE_EXTENSIONS`. `None` is never a source-code path.
pub fn is_source_code_path(path: Option<&str>) -> bool {
    path.is_some_and(|raw| {
        let normalized = raw.replace('\\', "/").to_ascii_lowercase();
        // Everything after the last separator, or the whole string when
        // there is no separator at all.
        let file_name = normalized
            .rsplit_once('/')
            .map_or(normalized.as_str(), |(_, name)| name);
        SOURCE_CODE_FILENAMES.contains(&file_name)
            || SOURCE_CODE_EXTENSIONS
                .iter()
                .any(|ext| normalized.ends_with(ext))
    })
}
86
/// True when `line` contains a `scheme://user:pass@host` URL with embedded
/// credentials.
///
/// The plaintext URL itself is the credential; Caesar / ROT-N decoding cannot
/// reveal anything new, and (worse) the 25-shift emission produces a
/// high-confidence decoded chunk whose body wins the per-line resolution
/// group over the real connection-string detector.
///
/// Every `://` in the line is examined, so a credentialled URL is still
/// found when it follows a bare-host URL on the same line
/// (`see https://example.com, db=postgres://u:p@h/db`).
///
/// A `:` between the scheme and the `@` is what distinguishes a
/// credentialled URL (`postgres://u:p@h`) from a bare host URL
/// (`https://example.com`) or a user-only URL (`ssh://git@host:22`); those
/// have no password to lose, so they are left alone.
pub(crate) fn line_has_credential_url(line: &str) -> bool {
    line.match_indices("://")
        .any(|(scheme_end, _)| credential_url_at(line, scheme_end))
}

/// Checks the single URL whose `://` separator starts at byte `scheme_end`.
fn credential_url_at(line: &str, scheme_end: usize) -> bool {
    // The scheme must be 2+ bytes of ASCII letters (or `+`, as in
    // `mongodb+srv`) immediately before `://`.
    let scheme_len = line[..scheme_end]
        .bytes()
        .rev()
        .take_while(|b| b.is_ascii_alphabetic() || *b == b'+')
        .count();
    if scheme_len < 2 {
        return false;
    }
    // The authority runs up to the first `/`, `?`, `#` or whitespace.
    let rest = &line[scheme_end + 3..];
    let authority_end = rest
        .find(|c: char| matches!(c, '/' | '?' | '#') || c.is_ascii_whitespace())
        .unwrap_or(rest.len());
    let authority = &rest[..authority_end];
    // The first `@` splits `user[:pass]` from the host. Requiring a `:`
    // BEFORE it means only URLs with an embedded password match; a port
    // colon after the `@` does not count.
    authority
        .find('@')
        .is_some_and(|at_pos| authority[..at_pos].contains(':'))
}
126
127impl Decoder for CaesarDecoder {
128    fn name(&self) -> &'static str {
129        "caesar"
130    }
131
132    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
133        // Refuse to recurse on our own output: shifting all 25 non-trivial
134        // shifts on a previous output's would re-shift back to the original
135        // (one of those 25 covers it) and trip evasion-aware downstream
136        // logic. One pass per input is enough.
137        if chunk.metadata.source_type.contains("/caesar") {
138            return Vec::new();
139        }
140        if is_source_code_path(chunk.metadata.path.as_deref()) {
141            return Vec::new();
142        }
143        let mut out = Vec::new();
144        // Skip Caesar on chunks whose lines already carry a URL with
145        // embedded credentials (`scheme://user:pass@host`). Every db
146        // connection-string URL is plaintext-readable already, so the
147        // 25-shift fan-out cannot reveal new information; its only
148        // observed effect is to emit a high-confidence garbage finding
149        // whose decoded body out-resolves the real URL match during the
150        // per-line resolution group. Investigator empirically attributed
151        // the postgres / mongo log-line + .env database FNs to this
152        // exact resolution loss. Gate per-line so a chunk that mixes
153        // URL traffic with Caesar-encoded creds elsewhere still gets
154        // the decoder where it matters.
155        let chunk_has_credential_url = chunk.data.lines().any(line_has_credential_url);
156        if chunk_has_credential_url {
157            return Vec::new();
158        }
159        for candidate in extract_encoded_values(&chunk.data) {
160            if candidate.len() < MIN_CAESAR_LEN {
161                continue;
162            }
163            // SHIFT-INVARIANT PRECONDITION (sound; a true superset of "some
164            // shift is credential-shaped"). `caesar_shift` maps letter->letter,
165            // digit->digit, other->other, so the two structural gates inside
166            // `looks_credential_shaped` are identical for the candidate and ALL
167            // 25 of its shifts:
168            //   * "contains >=1 ASCII digit"        - digits are shift-identity
169            //   * "has an 8+ ASCII-ALPHANUMERIC run" - alnum-ness is preserved
170            // If the RAW candidate fails either gate, NONE of its 25 shifts can
171            // pass `looks_credential_shaped`, so we skip the entire 25x
172            // `caesar_shift` allocation + re-scan loop for it. Only the
173            // KNOWN_PREFIXES check (the one gate a shift CAN newly satisfy) is
174            // left to the per-shift loop. This is byte-for-byte recall-
175            // equivalent - it removes pure-waste allocations, it does not gate
176            // out any shift that could have been shaped (unlike an
177            // alphabetic-run length gate, which is unsound: a `0x`/`SG.`/`hf_`
178            // prefix needs only a 1-2 letter run, so a credential-shaped shift
179            // can arise from a chunk with no long alphabetic run at all).
180            if !candidate_shape_invariant(&candidate) {
181                continue;
182            }
183            // Rotated-prefix prefilter: a credential-shaped variant's final gate
184            // is a KNOWN_PREFIXES substring in the SHIFTED text. Because a shift
185            // is a position-wise bijection, that is equivalent to the RAW
186            // candidate containing a `rot_{-k}(prefix)` needle for some k — one
187            // Aho-Corasick pass tests all 38×25 needles at once. No needle hit
188            // means no shift can satisfy `looks_credential_shaped`, so the 25×
189            // `caesar_shift` allocation + re-scan fan-out below is skipped. This
190            // is recall-EXACT (see ROTATED_PREFIX_AC); the per-shift loop still
191            // confirms each surviving candidate via the full predicate.
192            if let Some(ac) = ROTATED_PREFIX_AC.as_ref() {
193                if !ac.is_match(candidate.as_str()) {
194                    continue;
195                }
196            }
197            for shift in 1..=25u8 {
198                let decoded = caesar_shift(&candidate, shift);
199                if !looks_credential_shaped(&decoded) {
200                    continue;
201                }
202                // NOTE: we intentionally use the non-spliced push.
203                // Splicing the decoded variant back into the parent
204                // (which the base64/hex paths do for companion-anchor
205                // preservation) is wrong for Caesar: Caesar produces
206                // 25 candidate shifts per blob, of which several can
207                // randomly satisfy hex/UUID shape gates. Splicing
208                // those into the parent multiplies findings under
209                // keyword-anchored detectors with shifted credentials
210                // that don't match the ground-truth value the user
211                // planted. Caesar's value is the bare decoded
212                // candidate; let it surface as its own chunk so the
213                // dedup layer can collapse identical findings.
214                push_decoded_text_chunk(&mut out, chunk, decoded, self.name());
215            }
216        }
217        out
218    }
219}
220
221/// Shift-invariant half of `looks_credential_shaped`, evaluated ONCE on the raw
222/// candidate before the 25x shift loop. A Caesar/ROT-N shift is a permutation
223/// within the letters and the identity on digits/punctuation, so both of these
224/// gates produce the SAME answer for the candidate and for every one of its 25
225/// shifts:
226///   1. at least one ASCII digit (digits are never moved by a shift), and
227///   2. an 8+ contiguous ASCII-alphanumeric run (alphanumeric-ness of each
228///      byte is preserved under a shift).
229/// If the raw candidate fails either, no shift can satisfy
230/// `looks_credential_shaped`, so the whole 25-allocation fan-out for that
231/// candidate is pure waste and is skipped. This is a true SUPERSET of the
232/// per-shift `looks_credential_shaped` predicate (it only ever short-circuits
233/// candidates that would have produced zero shaped shifts), so it is exactly
234/// recall-preserving. It deliberately does NOT pre-check the KNOWN_PREFIXES
235/// substring - that is the one gate a shift CAN newly satisfy by rotating
236/// letters into a prefix (e.g. `BLJB`+25 -> `AKIA`), so it stays in the loop.
237fn candidate_shape_invariant(s: &str) -> bool {
238    let bytes = s.as_bytes();
239    if !bytes.iter().any(|b| b.is_ascii_digit()) {
240        return false;
241    }
242    // Must also contain at least one letter for any shift to do anything.
243    if !bytes.iter().any(|b| b.is_ascii_alphabetic()) {
244        return false;
245    }
246    let mut run = 0usize;
247    for &b in bytes {
248        if b.is_ascii_alphanumeric() {
249            run += 1;
250            if run >= MIN_ALNUM_RUN {
251                return true;
252            }
253        } else {
254            run = 0;
255        }
256    }
257    false
258}
259
/// Rotates every ASCII letter of `input` forward by `shift` positions within
/// its own case, wrapping around the alphabet. All other characters (digits,
/// punctuation, non-ASCII) are copied unchanged.
///
/// `shift` is taken modulo 26, so any `u8` is accepted: `26` is the identity
/// and `caesar_shift(s, 26 - k)` undoes `caesar_shift(s, k)` for `k` in
/// `1..=25`.
pub fn caesar_shift(input: &str, shift: u8) -> String {
    // Reduce up front: the rotation below is computed in `u8`, and adding an
    // unreduced shift above 230 to a letter offset of up to 25 would overflow
    // (a panic in debug builds, a wrong letter in release builds).
    let shift = shift % 26;
    let rotate = |ch: char| -> char {
        let base = match ch {
            'A'..='Z' => b'A',
            'a'..='z' => b'a',
            _ => return ch,
        };
        // `ch` is an ASCII letter here, so the cast is lossless; the offset
        // and the reduced shift are both at most 25.
        char::from(base + (ch as u8 - base + shift) % 26)
    };
    let mut out = String::with_capacity(input.len());
    out.extend(input.chars().map(rotate));
    out
}
280
281pub fn looks_credential_shaped(s: &str) -> bool {
282    let bytes = s.as_bytes();
283    if !bytes.iter().any(|b| b.is_ascii_digit()) {
284        return false;
285    }
286    let mut run = 0usize;
287    let mut saw_long_run = false;
288    for &b in bytes {
289        if b.is_ascii_alphanumeric() {
290            run += 1;
291            if run >= MIN_ALNUM_RUN {
292                saw_long_run = true;
293                break;
294            }
295        } else {
296            run = 0;
297        }
298    }
299    if !saw_long_run {
300        return false;
301    }
302    // Same rationale as `reverse::looks_reversible`: gate on a known
303    // provider prefix appearing in the decoded text. Without this, any
304    // Caesar shift of a credential-shaped input (e.g. `sk_live_...`
305    // shifted +23 → `ph_ifsb_...`) gets emitted as a decoded chunk
306    // whose substrings can incidentally collide with detector regexes
307    // (`sb_4bZ39EnIvgT...` matches the stackblitz `sb_[a-zA-Z0-9_-]{20,}`
308    // regex purely by letter coincidence). The downstream
309    // `should_suppress_named_detector_finding` bypasses the
310    // EXAMPLE / INSERT / CHANGE / REPLACE markers for `/caesar`
311    // source_types (because evasion-decoded inputs CAN legitimately
312    // be a planted-credential rotation), so the gate has to happen
313    // here at decoder-output time.
314    crate::confidence::KNOWN_PREFIXES
315        .iter()
316        .any(|prefix| s.contains(prefix))
317}