keyhog_scanner/decode/caesar.rs
1use super::pipeline::{extract_encoded_values, push_decoded_text_chunk};
2use super::Decoder;
3use aho_corasick::AhoCorasick;
4use keyhog_core::Chunk;
5use std::sync::LazyLock;
6
/// Caesar/ROT13/ROT-N decoder. A handful of malware-config dumps and CTF
/// fixtures store their tokens ROT13'd (`AKIA...` → `NXVN...`). For every
/// candidate of at least `MIN_CAESAR_LEN` (16) chars, emit a decoded variant
/// for each of the 25 non-trivial Caesar shifts that produces a *plausibly
/// credential-shaped* string.
///
/// "Plausibly shaped" gates the explosion: a 100-char chunk would otherwise
/// produce 25 sibling chunks per candidate. We require (see
/// [`looks_credential_shaped`]):
/// 1. The decoded variant contains ≥1 ASCII digit (most modern API key
///    formats include digits - pure-letter Caesar output rarely indicates
///    a real secret).
/// 2. The decoded variant has at least 8 ASCII alphanumeric chars in a
///    contiguous run (matches AWS / GitHub / Slack token shapes).
/// 3. The decoded variant contains at least one entry of
///    `crate::confidence::KNOWN_PREFIXES` as a substring.
///
/// Together these checks keep the chunk count flat on prose-heavy inputs.
///
/// Source-code files are skipped entirely (see [`is_source_code_path`]).
/// Real secrets are never Caesar-encoded inside source - the 25-shift
/// fan-out on every prose-comment in a codebase just hallucinates detector
/// matches from random letter runs (helicone-api-key on a
/// `//! Source trait` doc comment was the original reproducer; see
/// dogfood-2026-05-21.md finding #5).
///
/// Two further kinds of chunk produce no output at all: chunks whose
/// `source_type` already contains `/caesar` (the decoder never re-decodes its
/// own output), and chunks in which any line carries a
/// `scheme://user:pass@host` URL (see [`line_has_credential_url`]).
pub struct CaesarDecoder;
28
/// Shortest candidate (as measured by `candidate.len()`) that the decoder
/// will attempt to shift; anything shorter is skipped outright.
const MIN_CAESAR_LEN: usize = 16;
/// Length of the contiguous ASCII-alphanumeric run that both
/// `candidate_shape_invariant` and `looks_credential_shaped` require.
const MIN_ALNUM_RUN: usize = 8;
31
32/// Aho-Corasick over the "rotated known-prefix" needle set: for every
33/// [`crate::confidence::KNOWN_PREFIXES`] entry `P` and every non-trivial shift
34/// `k` in `1..=25`, the string `caesar_shift(P, 26 - k)` — i.e. `P` with its
35/// ASCII letters rotated BACKWARD by `k` (digits / punctuation fixed).
36///
37/// SOUNDNESS (recall-exact, not merely a superset). `caesar_shift(_, k)` is a
38/// position-wise bijection on a string, so for any candidate `c`:
39/// `caesar_shift(c, k).contains(P)` ⟺ `c.contains(caesar_shift(P, 26 - k))`.
40/// Therefore "some shift in `1..=25` of `c` contains some known prefix" is
41/// EXACTLY "`c` contains some needle in this automaton". The final gate inside
42/// [`looks_credential_shaped`] is precisely that `KNOWN_PREFIXES` substring
43/// test, and its other two gates (≥1 digit, an 8+ alphanumeric run) are
44/// shift-invariant and checked once by [`candidate_shape_invariant`]. So a
45/// candidate that matches NO needle here can never produce a credential-shaped
46/// variant under any shift — its entire 25× `caesar_shift` fan-out + re-scan is
47/// provably dead work and is skipped with zero recall loss. This replaces the
48/// unsound "longest alphabetic run ≥ 16" gate (a `0x` / `SG.` / `hf_` prefix
49/// needs only a 1–2 letter run, so a credential-shaped shift can arise from a
50/// chunk with no long alphabetic run). See `perf_decode_caesar.rs`.
51static ROTATED_PREFIX_AC: LazyLock<Option<AhoCorasick>> = LazyLock::new(|| {
52 let mut needles: Vec<String> = Vec::new();
53 for prefix in crate::confidence::KNOWN_PREFIXES {
54 for k in 1..=25u8 {
55 // rot_{-k}(P) == caesar_shift(P, 26 - k); k in 1..=25 => 26-k in 1..=25.
56 needles.push(caesar_shift(prefix, 26 - k));
57 }
58 }
59 AhoCorasick::new(&needles).ok()
60});
61
/// Path suffixes for which Caesar-decoding is pure noise. Each entry is
/// compared against the end of the lower-cased path. The list is deliberately
/// short: the dominant source-code extensions a scanner is realistically
/// pointed at, plus a few text/markup formats.
const SOURCE_CODE_EXTENSIONS: &[&str] = &[
    ".rs", ".py", ".go", ".js", ".jsx", ".ts", ".tsx", ".java", ".kt", ".scala", ".c", ".cc",
    ".cpp", ".cxx", ".h", ".hh", ".hpp", ".cs", ".rb", ".php", ".swift", ".m", ".mm", ".sh",
    ".bash", ".zsh", ".fish", ".lua", ".pl", ".pm", ".sql", ".html", ".htm", ".css", ".scss",
    ".sass", ".vue", ".svelte", ".md", ".rst", ".txt", ".adoc", ".tbl", ".mk", ".cmake",
];

/// Extension-less build files (lower-cased) that are treated as source code
/// when they are the final path component.
const SOURCE_CODE_FILENAMES: &[&str] = &["kconfig", "makefile", "cmakelists.txt"];

/// Reports whether `path` names a file the Caesar decoder should ignore.
///
/// The path is normalised first: backslashes become `/` and ASCII letters are
/// lower-cased. It counts as source code when its final component equals an
/// entry of `SOURCE_CODE_FILENAMES`, or when the whole normalised path ends
/// with an entry of `SOURCE_CODE_EXTENSIONS`. A missing path (`None`) is never
/// source code.
pub fn is_source_code_path(path: Option<&str>) -> bool {
    let Some(raw) = path else {
        return false;
    };
    let normalized = raw.replace('\\', "/").to_ascii_lowercase();
    // Everything after the last separator, or the whole path if there is none.
    let basename = normalized
        .rsplit_once('/')
        .map_or(normalized.as_str(), |(_, name)| name);
    let is_known_build_file = SOURCE_CODE_FILENAMES.iter().any(|known| *known == basename);
    is_known_build_file
        || SOURCE_CODE_EXTENSIONS
            .iter()
            .any(|suffix| normalized.ends_with(suffix))
}
86
/// True when `line` contains a `scheme://user:pass@host` URL with embedded
/// credentials. The plaintext URL itself is the credential; Caesar /
/// ROT-N decoding cannot reveal anything new, and (worse) the 25-shift
/// emission produces a high-confidence decoded chunk whose body wins the
/// per-line resolution group over the real connection-string detector.
///
/// Every `://` occurrence in the line is examined, not just the first, so a
/// line such as `see https://example.com, db postgres://u:p@h` is still
/// recognised. An occurrence matches when:
/// * the bytes immediately before `://` end in a run of at least two ASCII
///   letters or `+` characters (the scheme, e.g. `postgres`, `mongodb+srv`);
/// * the authority - the text after `://` up to the first `/`, `?`, `#` or
///   ASCII whitespace - contains an `@`; and
/// * a `:` appears before the first such `@`.
///
/// The `:` before `@` is what distinguishes a credentialled URL
/// (`postgres://u:p@h`) from a bare host URL (`https://example.com`) or a
/// user-only URL (`https://u@h`) - those have no password to lose, so they
/// are left alone.
pub(crate) fn line_has_credential_url(line: &str) -> bool {
    line.match_indices("://").any(|(scheme_end, _)| {
        // Count scheme bytes backwards from the separator.
        let scheme_len = line.as_bytes()[..scheme_end]
            .iter()
            .rev()
            .take_while(|b| b.is_ascii_alphabetic() || **b == b'+')
            .count();
        if scheme_len < 2 {
            return false;
        }
        // `://` is three ASCII bytes, so this index is a char boundary.
        let rest = &line[scheme_end + 3..];
        let authority_end = rest
            .find(|c: char| matches!(c, '/' | '?' | '#') || c.is_ascii_whitespace())
            .unwrap_or(rest.len());
        let authority = &rest[..authority_end];
        // The first `@` splits userinfo from host; require `user:pass` before it.
        authority
            .find('@')
            .is_some_and(|at| authority[..at].contains(':'))
    })
}
126
127impl Decoder for CaesarDecoder {
128 fn name(&self) -> &'static str {
129 "caesar"
130 }
131
132 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
133 // Refuse to recurse on our own output: shifting all 25 non-trivial
134 // shifts on a previous output's would re-shift back to the original
135 // (one of those 25 covers it) and trip evasion-aware downstream
136 // logic. One pass per input is enough.
137 if chunk.metadata.source_type.contains("/caesar") {
138 return Vec::new();
139 }
140 if is_source_code_path(chunk.metadata.path.as_deref()) {
141 return Vec::new();
142 }
143 let mut out = Vec::new();
144 // Skip Caesar on chunks whose lines already carry a URL with
145 // embedded credentials (`scheme://user:pass@host`). Every db
146 // connection-string URL is plaintext-readable already, so the
147 // 25-shift fan-out cannot reveal new information; its only
148 // observed effect is to emit a high-confidence garbage finding
149 // whose decoded body out-resolves the real URL match during the
150 // per-line resolution group. Investigator empirically attributed
151 // the postgres / mongo log-line + .env database FNs to this
152 // exact resolution loss. Gate per-line so a chunk that mixes
153 // URL traffic with Caesar-encoded creds elsewhere still gets
154 // the decoder where it matters.
155 let chunk_has_credential_url = chunk.data.lines().any(line_has_credential_url);
156 if chunk_has_credential_url {
157 return Vec::new();
158 }
159 for candidate in extract_encoded_values(&chunk.data) {
160 if candidate.len() < MIN_CAESAR_LEN {
161 continue;
162 }
163 // SHIFT-INVARIANT PRECONDITION (sound; a true superset of "some
164 // shift is credential-shaped"). `caesar_shift` maps letter->letter,
165 // digit->digit, other->other, so the two structural gates inside
166 // `looks_credential_shaped` are identical for the candidate and ALL
167 // 25 of its shifts:
168 // * "contains >=1 ASCII digit" - digits are shift-identity
169 // * "has an 8+ ASCII-ALPHANUMERIC run" - alnum-ness is preserved
170 // If the RAW candidate fails either gate, NONE of its 25 shifts can
171 // pass `looks_credential_shaped`, so we skip the entire 25x
172 // `caesar_shift` allocation + re-scan loop for it. Only the
173 // KNOWN_PREFIXES check (the one gate a shift CAN newly satisfy) is
174 // left to the per-shift loop. This is byte-for-byte recall-
175 // equivalent - it removes pure-waste allocations, it does not gate
176 // out any shift that could have been shaped (unlike an
177 // alphabetic-run length gate, which is unsound: a `0x`/`SG.`/`hf_`
178 // prefix needs only a 1-2 letter run, so a credential-shaped shift
179 // can arise from a chunk with no long alphabetic run at all).
180 if !candidate_shape_invariant(&candidate) {
181 continue;
182 }
183 // Rotated-prefix prefilter: a credential-shaped variant's final gate
184 // is a KNOWN_PREFIXES substring in the SHIFTED text. Because a shift
185 // is a position-wise bijection, that is equivalent to the RAW
186 // candidate containing a `rot_{-k}(prefix)` needle for some k — one
187 // Aho-Corasick pass tests all 38×25 needles at once. No needle hit
188 // means no shift can satisfy `looks_credential_shaped`, so the 25×
189 // `caesar_shift` allocation + re-scan fan-out below is skipped. This
190 // is recall-EXACT (see ROTATED_PREFIX_AC); the per-shift loop still
191 // confirms each surviving candidate via the full predicate.
192 if let Some(ac) = ROTATED_PREFIX_AC.as_ref() {
193 if !ac.is_match(candidate.as_str()) {
194 continue;
195 }
196 }
197 for shift in 1..=25u8 {
198 let decoded = caesar_shift(&candidate, shift);
199 if !looks_credential_shaped(&decoded) {
200 continue;
201 }
202 // NOTE: we intentionally use the non-spliced push.
203 // Splicing the decoded variant back into the parent
204 // (which the base64/hex paths do for companion-anchor
205 // preservation) is wrong for Caesar: Caesar produces
206 // 25 candidate shifts per blob, of which several can
207 // randomly satisfy hex/UUID shape gates. Splicing
208 // those into the parent multiplies findings under
209 // keyword-anchored detectors with shifted credentials
210 // that don't match the ground-truth value the user
211 // planted. Caesar's value is the bare decoded
212 // candidate; let it surface as its own chunk so the
213 // dedup layer can collapse identical findings.
214 push_decoded_text_chunk(&mut out, chunk, decoded, self.name());
215 }
216 }
217 out
218 }
219}
220
221/// Shift-invariant half of `looks_credential_shaped`, evaluated ONCE on the raw
222/// candidate before the 25x shift loop. A Caesar/ROT-N shift is a permutation
223/// within the letters and the identity on digits/punctuation, so both of these
224/// gates produce the SAME answer for the candidate and for every one of its 25
225/// shifts:
226/// 1. at least one ASCII digit (digits are never moved by a shift), and
227/// 2. an 8+ contiguous ASCII-alphanumeric run (alphanumeric-ness of each
228/// byte is preserved under a shift).
229/// If the raw candidate fails either, no shift can satisfy
230/// `looks_credential_shaped`, so the whole 25-allocation fan-out for that
231/// candidate is pure waste and is skipped. This is a true SUPERSET of the
232/// per-shift `looks_credential_shaped` predicate (it only ever short-circuits
233/// candidates that would have produced zero shaped shifts), so it is exactly
234/// recall-preserving. It deliberately does NOT pre-check the KNOWN_PREFIXES
235/// substring - that is the one gate a shift CAN newly satisfy by rotating
236/// letters into a prefix (e.g. `BLJB`+25 -> `AKIA`), so it stays in the loop.
237fn candidate_shape_invariant(s: &str) -> bool {
238 let bytes = s.as_bytes();
239 if !bytes.iter().any(|b| b.is_ascii_digit()) {
240 return false;
241 }
242 // Must also contain at least one letter for any shift to do anything.
243 if !bytes.iter().any(|b| b.is_ascii_alphabetic()) {
244 return false;
245 }
246 let mut run = 0usize;
247 for &b in bytes {
248 if b.is_ascii_alphanumeric() {
249 run += 1;
250 if run >= MIN_ALNUM_RUN {
251 return true;
252 }
253 } else {
254 run = 0;
255 }
256 }
257 false
258}
259
/// Rotates every ASCII letter of `input` forward by `shift` positions within
/// its own case (`A-Z` stays upper-case, `a-z` stays lower-case), wrapping
/// around the 26-letter alphabet. Every other character - digits,
/// punctuation, non-ASCII text - is copied through unchanged.
///
/// `shift` is reduced modulo 26 before use, so any `u8` is accepted:
/// `0` and `26` are the identity and `13` is ROT13. Without that reduction
/// the `u8` sum `offset + shift` overflows for `shift >= 231`, which panics in
/// debug builds and yields the wrong letter in release builds.
pub fn caesar_shift(input: &str, shift: u8) -> String {
    let shift = shift % 26;
    // Both operands are now < 26, so `offset + shift` cannot overflow a `u8`.
    let rotate = |ch: char, base: u8| char::from(base + (ch as u8 - base + shift) % 26);
    input
        .chars()
        .map(|ch| match ch {
            'A'..='Z' => rotate(ch, b'A'),
            'a'..='z' => rotate(ch, b'a'),
            _ => ch,
        })
        .collect()
}
280
281pub fn looks_credential_shaped(s: &str) -> bool {
282 let bytes = s.as_bytes();
283 if !bytes.iter().any(|b| b.is_ascii_digit()) {
284 return false;
285 }
286 let mut run = 0usize;
287 let mut saw_long_run = false;
288 for &b in bytes {
289 if b.is_ascii_alphanumeric() {
290 run += 1;
291 if run >= MIN_ALNUM_RUN {
292 saw_long_run = true;
293 break;
294 }
295 } else {
296 run = 0;
297 }
298 }
299 if !saw_long_run {
300 return false;
301 }
302 // Same rationale as `reverse::looks_reversible`: gate on a known
303 // provider prefix appearing in the decoded text. Without this, any
304 // Caesar shift of a credential-shaped input (e.g. `sk_live_...`
305 // shifted +23 → `ph_ifsb_...`) gets emitted as a decoded chunk
306 // whose substrings can incidentally collide with detector regexes
307 // (`sb_4bZ39EnIvgT...` matches the stackblitz `sb_[a-zA-Z0-9_-]{20,}`
308 // regex purely by letter coincidence). The downstream
309 // `should_suppress_named_detector_finding` bypasses the
310 // EXAMPLE / INSERT / CHANGE / REPLACE markers for `/caesar`
311 // source_types (because evasion-decoded inputs CAN legitimately
312 // be a planted-credential rotation), so the gate has to happen
313 // here at decoder-output time.
314 crate::confidence::KNOWN_PREFIXES
315 .iter()
316 .any(|prefix| s.contains(prefix))
317}