Skip to main content

keyhog_scanner/decode/
caesar.rs

1use super::pipeline::{extract_encoded_values, push_decoded_text_chunk};
2use super::Decoder;
3use keyhog_core::Chunk;
4
/// Caesar/ROT13/ROT-N decoder. A handful of malware-config dumps and CTF
/// fixtures store their tokens ROT13'd (`AKIA...` → `NXVN...`). For every
/// candidate ≥ 16 chars, emit decoded variants for the 25 non-trivial Caesar
/// shifts that produce a *plausibly credential-shaped* string.
///
/// "Plausibly shaped" gates the explosion: a 100-char chunk would otherwise
/// produce 25 sibling chunks per candidate. We require:
///   1. The decoded variant contains ≥1 ASCII digit (most modern API key
///      formats include digits - pure-letter Caesar output rarely indicates
///      a real secret).
///   2. The decoded variant has at least 8 ASCII alphanumeric chars in a
///      contiguous run (matches AWS / GitHub / Slack token shapes).
///   3. The decoded variant contains one of the known provider prefixes
///      (`crate::confidence::KNOWN_PREFIXES`), so a shift that merely
///      *looks* token-like by letter coincidence is not emitted.
///
/// All three checks together keep the chunk count flat on prose-heavy inputs.
///
/// Source-code files are skipped entirely. Real secrets are never Caesar-
/// encoded inside source - the 25-shift fan-out on every prose-comment in
/// a codebase just hallucinates detector matches from random letter runs
/// (helicone-api-key on a `//! Source trait` doc comment was the original
/// reproducer; see dogfood-2026-05-21.md finding #5).
pub struct CaesarDecoder;
26
/// Shortest candidate (compared against `candidate.len()`) that is worth
/// shifting; anything shorter is skipped before any decoding work.
const MIN_CAESAR_LEN: usize = 16;
/// Minimum length of a contiguous ASCII-alphanumeric run a decoded variant
/// must contain to count as credential-shaped.
const MIN_ALNUM_RUN: usize = 8;
29
/// File extensions where Caesar-decoding is pure noise. Matched
/// ASCII-case-insensitively against the suffix of `chunk.metadata.path`.
/// Kept short - only the dominant source-code extensions a scanner is
/// realistically pointed at.
///
/// Every entry must be lower-case ASCII and include the leading dot.
const SOURCE_CODE_EXTENSIONS: &[&str] = &[
    ".rs", ".py", ".go", ".js", ".jsx", ".ts", ".tsx", ".java", ".kt", ".scala", ".c", ".cc",
    ".cpp", ".cxx", ".h", ".hh", ".hpp", ".cs", ".rb", ".php", ".swift", ".m", ".mm", ".sh",
    ".bash", ".zsh", ".fish", ".lua", ".pl", ".pm", ".sql", ".html", ".htm", ".css", ".scss",
    ".sass", ".vue", ".svelte", ".md", ".rst", ".txt", ".adoc",
];

/// Returns `true` when `path` ends with one of [`SOURCE_CODE_EXTENSIONS`],
/// ignoring ASCII case. `None` (no path known) is never treated as source.
///
/// The comparison works on the trailing bytes of the path instead of
/// lower-casing a copy of it, so no allocation happens per call. Slicing
/// bytes rather than `str` also means a multi-byte character near the end of
/// the path cannot cause a char-boundary panic.
pub fn is_source_code_path(path: Option<&str>) -> bool {
    let Some(p) = path else { return false };
    let path_bytes = p.as_bytes();
    SOURCE_CODE_EXTENSIONS.iter().any(|ext| {
        let ext_bytes = ext.as_bytes();
        // A path shorter than the extension cannot end with it.
        path_bytes.len() >= ext_bytes.len()
            && path_bytes[path_bytes.len() - ext_bytes.len()..].eq_ignore_ascii_case(ext_bytes)
    })
}
47
48impl Decoder for CaesarDecoder {
49    fn name(&self) -> &'static str {
50        "caesar"
51    }
52
53    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
54        // Refuse to recurse on our own output: shifting all 25 non-trivial
55        // shifts on a previous output's would re-shift back to the original
56        // (one of those 25 covers it) and trip evasion-aware downstream
57        // logic. One pass per input is enough.
58        if chunk.metadata.source_type.contains("/caesar") {
59            return Vec::new();
60        }
61        if is_source_code_path(chunk.metadata.path.as_deref()) {
62            return Vec::new();
63        }
64        let mut out = Vec::new();
65        for candidate in extract_encoded_values(&chunk.data) {
66            if candidate.len() < MIN_CAESAR_LEN {
67                continue;
68            }
69            // kimi-decode audit: caesar_shift is the identity for
70            // digits / punctuation / non-ASCII. A pure-digit candidate
71            // (e.g. a 16-digit PIN) produces 25 IDENTICAL shifts, all
72            // equal to the original. The seen-set later dedups them
73            // but each unnecessarily walks the full detector pipeline
74            // and emits a bare decoded chunk that scans the same text
75            // we already scanned in the parent. Skip if the input has
76            // no a-z/A-Z character to shift.
77            if !candidate.chars().any(|c| c.is_ascii_alphabetic()) {
78                continue;
79            }
80            for shift in 1..=25u8 {
81                let decoded = caesar_shift(&candidate, shift);
82                if !looks_credential_shaped(&decoded) {
83                    continue;
84                }
85                // NOTE: we intentionally use the non-spliced push.
86                // Splicing the decoded variant back into the parent
87                // (which the base64/hex paths do for companion-anchor
88                // preservation) is wrong for Caesar: Caesar produces
89                // 25 candidate shifts per blob, of which several can
90                // randomly satisfy hex/UUID shape gates. Splicing
91                // those into the parent multiplies findings under
92                // keyword-anchored detectors with shifted credentials
93                // that don't match the ground-truth value the user
94                // planted. Caesar's value is the bare decoded
95                // candidate; let it surface as its own chunk so the
96                // dedup layer can collapse identical findings.
97                push_decoded_text_chunk(&mut out, chunk, decoded, self.name());
98            }
99        }
100        out
101    }
102}
103
/// Rotates every ASCII letter in `input` forward by `shift` positions,
/// wrapping within its own case (`'z'` + 1 → `'a'`, `'Z'` + 1 → `'A'`).
///
/// Digits, punctuation, whitespace and non-ASCII characters are copied
/// through unchanged. `shift` is taken modulo 26, so any `u8` value is
/// accepted: 0 and 26 are the identity, and 255 behaves like 21.
pub fn caesar_shift(input: &str, shift: u8) -> String {
    // Reduce first: an alphabet offset is at most 25, so `offset + shift`
    // stays below 51 and cannot overflow `u8`. Without this, a shift of
    // 231 or more panics in debug builds and wraps to a wrong letter in
    // release builds.
    let shift = shift % 26;
    let rotate = |ch: char, base: u8| -> char {
        // Only called for ASCII letters, so `ch as u8` is lossless.
        let offset = (ch as u8 - base + shift) % 26;
        char::from(base + offset)
    };

    // Shifting never changes a character's UTF-8 width, so the output is
    // exactly as long as the input.
    let mut out = String::with_capacity(input.len());
    out.extend(input.chars().map(|ch| match ch {
        'A'..='Z' => rotate(ch, b'A'),
        'a'..='z' => rotate(ch, b'a'),
        _ => ch,
    }));
    out
}
124
125pub fn looks_credential_shaped(s: &str) -> bool {
126    let bytes = s.as_bytes();
127    if !bytes.iter().any(|b| b.is_ascii_digit()) {
128        return false;
129    }
130    let mut run = 0usize;
131    let mut saw_long_run = false;
132    for &b in bytes {
133        if b.is_ascii_alphanumeric() {
134            run += 1;
135            if run >= MIN_ALNUM_RUN {
136                saw_long_run = true;
137                break;
138            }
139        } else {
140            run = 0;
141        }
142    }
143    if !saw_long_run {
144        return false;
145    }
146    // Same rationale as `reverse::looks_reversible`: gate on a known
147    // provider prefix appearing in the decoded text. Without this, any
148    // Caesar shift of a credential-shaped input (e.g. `sk_live_...`
149    // shifted +23 → `ph_ifsb_...`) gets emitted as a decoded chunk
150    // whose substrings can incidentally collide with detector regexes
151    // (`sb_4bZ39EnIvgT...` matches the stackblitz `sb_[a-zA-Z0-9_-]{20,}`
152    // regex purely by letter coincidence). The downstream
153    // `should_suppress_named_detector_finding` bypasses the
154    // EXAMPLE / INSERT / CHANGE / REPLACE markers for `/caesar`
155    // source_types (because evasion-decoded inputs CAN legitimately
156    // be a planted-credential rotation), so the gate has to happen
157    // here at decoder-output time.
158    crate::confidence::KNOWN_PREFIXES
159        .iter()
160        .any(|prefix| s.contains(prefix))
161}