keyhog_scanner/decode/caesar.rs
1use super::pipeline::{extract_encoded_values, push_decoded_text_chunk};
2use super::Decoder;
3use keyhog_core::Chunk;
4
/// Caesar/ROT13/ROT-N decoder. A handful of malware-config dumps and CTF
/// fixtures store their tokens ROT13'd (`AKIA...` → `NXVN...`). For every
/// candidate of at least [`MIN_CAESAR_LEN`] bytes, emit decoded variants for
/// the 25 non-trivial Caesar shifts that produce a *plausibly
/// credential-shaped* string.
///
/// "Plausibly shaped" gates the explosion: a 100-char chunk would otherwise
/// produce 25 sibling chunks per candidate. [`looks_credential_shaped`]
/// requires all of:
/// 1. The decoded variant contains ≥1 ASCII digit (most modern API key
///    formats include digits - pure-letter Caesar output rarely indicates
///    a real secret).
/// 2. The decoded variant has at least [`MIN_ALNUM_RUN`] ASCII alphanumeric
///    chars in a contiguous run (matches AWS / GitHub / Slack token shapes).
/// 3. The decoded variant contains one of the known provider prefixes from
///    `crate::confidence::KNOWN_PREFIXES`, so a shift that merely *looks*
///    token-like by letter coincidence is not emitted.
///
/// Together these checks keep the chunk count flat on prose-heavy inputs.
///
/// Source-code files are skipped entirely. Real secrets are never Caesar-
/// encoded inside source - the 25-shift fan-out on every prose-comment in
/// a codebase just hallucinates detector matches from random letter runs
/// (helicone-api-key on a `//! Source trait` doc comment was the original
/// reproducer; see dogfood-2026-05-21.md finding #5).
#[derive(Debug, Clone, Copy, Default)]
pub struct CaesarDecoder;

/// Minimum candidate length, in bytes (`str::len`), before any Caesar shift
/// is attempted.
const MIN_CAESAR_LEN: usize = 16;
/// Minimum length of a contiguous ASCII-alphanumeric run that a decoded
/// variant must contain to count as credential-shaped.
const MIN_ALNUM_RUN: usize = 8;
29
/// Path suffixes for which Caesar decoding is skipped because it only
/// produces noise. Every entry is lower-case ASCII and includes the leading
/// dot; matching against a chunk path is case-insensitive. The list is kept
/// deliberately short: the dominant source-code extensions plus a few
/// markup / plain-text ones.
const SOURCE_CODE_EXTENSIONS: &[&str] = &[
    ".rs", ".py", ".go", ".js", ".jsx", ".ts", ".tsx", ".java", ".kt", ".scala", ".c", ".cc",
    ".cpp", ".cxx", ".h", ".hh", ".hpp", ".cs", ".rb", ".php", ".swift", ".m", ".mm", ".sh",
    ".bash", ".zsh", ".fish", ".lua", ".pl", ".pm", ".sql", ".html", ".htm", ".css", ".scss",
    ".sass", ".vue", ".svelte", ".md", ".rst", ".txt", ".adoc",
];

/// Returns `true` when `path` ends (ASCII case-insensitively) with one of
/// [`SOURCE_CODE_EXTENSIONS`]. A missing path (`None`) is never treated as
/// source code.
pub fn is_source_code_path(path: Option<&str>) -> bool {
    match path {
        None => false,
        Some(p) => {
            let raw = p.as_bytes();
            SOURCE_CODE_EXTENSIONS.iter().any(|ext| {
                // Compare only the trailing `ext.len()` bytes, ignoring ASCII
                // case. The extensions are pure ASCII, so a byte-level
                // comparison cannot mis-split a multi-byte character.
                raw.len() >= ext.len()
                    && raw[raw.len() - ext.len()..].eq_ignore_ascii_case(ext.as_bytes())
            })
        }
    }
}
47
48impl Decoder for CaesarDecoder {
49 fn name(&self) -> &'static str {
50 "caesar"
51 }
52
53 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
54 // Refuse to recurse on our own output: shifting all 25 non-trivial
55 // shifts on a previous output's would re-shift back to the original
56 // (one of those 25 covers it) and trip evasion-aware downstream
57 // logic. One pass per input is enough.
58 if chunk.metadata.source_type.contains("/caesar") {
59 return Vec::new();
60 }
61 if is_source_code_path(chunk.metadata.path.as_deref()) {
62 return Vec::new();
63 }
64 let mut out = Vec::new();
65 for candidate in extract_encoded_values(&chunk.data) {
66 if candidate.len() < MIN_CAESAR_LEN {
67 continue;
68 }
69 // kimi-decode audit: caesar_shift is the identity for
70 // digits / punctuation / non-ASCII. A pure-digit candidate
71 // (e.g. a 16-digit PIN) produces 25 IDENTICAL shifts, all
72 // equal to the original. The seen-set later dedups them
73 // but each unnecessarily walks the full detector pipeline
74 // and emits a bare decoded chunk that scans the same text
75 // we already scanned in the parent. Skip if the input has
76 // no a-z/A-Z character to shift.
77 if !candidate.chars().any(|c| c.is_ascii_alphabetic()) {
78 continue;
79 }
80 for shift in 1..=25u8 {
81 let decoded = caesar_shift(&candidate, shift);
82 if !looks_credential_shaped(&decoded) {
83 continue;
84 }
85 // NOTE: we intentionally use the non-spliced push.
86 // Splicing the decoded variant back into the parent
87 // (which the base64/hex paths do for companion-anchor
88 // preservation) is wrong for Caesar: Caesar produces
89 // 25 candidate shifts per blob, of which several can
90 // randomly satisfy hex/UUID shape gates. Splicing
91 // those into the parent multiplies findings under
92 // keyword-anchored detectors with shifted credentials
93 // that don't match the ground-truth value the user
94 // planted. Caesar's value is the bare decoded
95 // candidate; let it surface as its own chunk so the
96 // dedup layer can collapse identical findings.
97 push_decoded_text_chunk(&mut out, chunk, decoded, self.name());
98 }
99 }
100 out
101 }
102}
103
/// Rotates every ASCII letter in `input` forward by `shift` positions,
/// wrapping within its own case (`'z'` shifted by 1 becomes `'a'`). Digits,
/// punctuation and non-ASCII characters are copied through unchanged.
///
/// `shift` is interpreted modulo 26, so any `u8` value is accepted:
/// `caesar_shift(s, 26)` returns `s` unchanged and `caesar_shift(s, 255)`
/// equals `caesar_shift(s, 21)`.
pub fn caesar_shift(input: &str, shift: u8) -> String {
    // Reduce first: a letter offset (≤ 25) plus the reduced shift (≤ 25)
    // always fits in `u8`. Adding the raw shift would overflow for
    // `shift > 230`, panicking in debug builds and yielding wrong letters
    // in release builds.
    let shift = shift % 26;
    let rotate = |ch: char, base: u8| -> char {
        // `ch` is guaranteed ASCII by the match arms below, so the cast to
        // `u8` is lossless.
        char::from(base + (ch as u8 - base + shift) % 26)
    };

    let mut out = String::with_capacity(input.len());
    out.extend(input.chars().map(|ch| match ch {
        'A'..='Z' => rotate(ch, b'A'),
        'a'..='z' => rotate(ch, b'a'),
        _ => ch,
    }));
    out
}
124
125pub fn looks_credential_shaped(s: &str) -> bool {
126 let bytes = s.as_bytes();
127 if !bytes.iter().any(|b| b.is_ascii_digit()) {
128 return false;
129 }
130 let mut run = 0usize;
131 let mut saw_long_run = false;
132 for &b in bytes {
133 if b.is_ascii_alphanumeric() {
134 run += 1;
135 if run >= MIN_ALNUM_RUN {
136 saw_long_run = true;
137 break;
138 }
139 } else {
140 run = 0;
141 }
142 }
143 if !saw_long_run {
144 return false;
145 }
146 // Same rationale as `reverse::looks_reversible`: gate on a known
147 // provider prefix appearing in the decoded text. Without this, any
148 // Caesar shift of a credential-shaped input (e.g. `sk_live_...`
149 // shifted +23 → `ph_ifsb_...`) gets emitted as a decoded chunk
150 // whose substrings can incidentally collide with detector regexes
151 // (`sb_4bZ39EnIvgT...` matches the stackblitz `sb_[a-zA-Z0-9_-]{20,}`
152 // regex purely by letter coincidence). The downstream
153 // `should_suppress_named_detector_finding` bypasses the
154 // EXAMPLE / INSERT / CHANGE / REPLACE markers for `/caesar`
155 // source_types (because evasion-decoded inputs CAN legitimately
156 // be a planted-credential rotation), so the gate has to happen
157 // here at decoder-output time.
158 crate::confidence::KNOWN_PREFIXES
159 .iter()
160 .any(|prefix| s.contains(prefix))
161}