keyhog_scanner/compiler_build.rs
1//! Logic for compiling detector specifications into an efficient scanning engine.
2
3use crate::error::Result;
4use crate::types::*;
5use keyhog_core::DetectorSpec;
6
7use super::compiler_prefix::{extract_inner_literals, extract_literal_prefixes};
8
9use super::compiler_compile::{compile_detector_companions, compile_pattern};
10
11pub struct CompileState {
12 pub ac_literals: Vec<String>,
13 pub ac_map: Vec<CompiledPattern>,
14 pub fallback: Vec<(CompiledPattern, Vec<String>)>,
15 pub companions: Vec<Vec<CompiledCompanion>>,
16 pub quality_warnings: Vec<String>,
17}
18
19pub fn build_compile_state(detectors: &[DetectorSpec]) -> Result<CompileState> {
20 use rayon::prelude::*;
21
22 // De-duplicate identical regex strings BEFORE compilation. The 888-
23 // detector corpus has ~6-15% duplicate patterns (e.g. multiple
24 // google-* detectors share the `AIza` regex shape). Compiling each
25 // once cuts startup-compile time and RAM proportionally - see
26 // audits/legendary-2026-04-26.
27 //
28 // The count is informational only (one debug log line), so gate the
29 // whole computation behind the DEBUG level check and borrow the regex
30 // sources instead of cloning them. Under any non-debug level this is
31 // zero allocation - it used to heap-clone ~1000+ regex source strings
32 // into an owned HashMap on every scanner construction (every CLI
33 // invocation, every daemon/watch recompile) solely to print the count.
34 if tracing::enabled!(tracing::Level::DEBUG) {
35 let unique = detectors
36 .iter()
37 .flat_map(|d| d.patterns.iter().map(|p| p.regex.as_str()))
38 .collect::<std::collections::HashSet<&str>>()
39 .len();
40 tracing::debug!(unique, "compiler dedup: unique pattern regexes");
41 }
42
43 // Phase 1: Pre-compile all regexes in parallel (the expensive part).
44 let compiled_results: Vec<Result<(Vec<CompiledPattern>, Vec<CompiledCompanion>)>> = detectors
45 .par_iter()
46 .enumerate()
47 .map(|(detector_index, detector)| {
48 let companions = compile_detector_companions(detector)?;
49 let mut patterns = Vec::new();
50 for (pattern_index, pattern) in detector.patterns.iter().enumerate() {
51 patterns.push(compile_pattern(
52 detector_index,
53 pattern_index,
54 pattern,
55 &detector.id,
56 )?);
57 }
58 Ok((patterns, companions))
59 })
60 .collect();
61
62 // Phase 2: Assemble results sequentially (fast, no regex compilation).
63 let mut ac_literals = Vec::new();
64 let mut ac_map = Vec::new();
65 let mut fallback = Vec::new();
66 let mut companions = Vec::with_capacity(detectors.len());
67 let mut quality_warnings = Vec::new();
68
69 for (detector_index, (result, detector)) in compiled_results
70 .into_iter()
71 .zip(detectors.iter())
72 .enumerate()
73 {
74 let (compiled_patterns, detector_companions) = result?;
75 companions.push(detector_companions);
76
77 for (pattern_index, (compiled, pattern)) in compiled_patterns
78 .into_iter()
79 .zip(detector.patterns.iter())
80 .enumerate()
81 {
82 let prefixes = extract_literal_prefixes(&pattern.regex);
83
84 // Homoglyph expansion for high-confidence patterns: catches
85 // tokens where the literal prefix has been visually spoofed
86 // with Cyrillic/Greek/full-width lookalikes. Earlier code
87 // dropped just the expanded PREFIX into fallback as
88 // `Regex::new("^[hh][ff]_")` - anchored to start, but with
89 // NO body constraint, so any string beginning with the
90 // prefix would match. Combined with the task #69 fallback
91 // wire fix that finally runs these patterns, that turned
92 // every prefix-anchored detector into "fires on `<prefix>*`."
93 // Fix: substitute the expanded prefix into the FULL regex so
94 // the homoglyph variant still requires the rest of the
95 // pattern to match.
96 for prefix in &prefixes {
97 if prefix.len() < 3 {
98 continue;
99 }
100 let expanded_prefix = crate::homoglyph::expand_homoglyphs(prefix);
101 if expanded_prefix == *prefix {
102 continue;
103 }
104 let full_homoglyph_regex =
105 if let Some(suffix) = pattern.regex.strip_prefix(prefix.as_str()) {
106 // Simple case: prefix is the literal head of the regex.
107 format!("{expanded_prefix}{suffix}")
108 } else if let Some(rewritten) =
109 rewrite_alternation_prefix(&pattern.regex, prefix, &expanded_prefix)
110 {
111 // Alternation case: regex is `(?:p1|p2|...)body`. Replace
112 // the leading `(?:...)` with the expanded prefix so the
113 // homoglyph variant still requires the rest of the pattern
114 // to match. Without this, every alternation-prefix detector
115 // silently skipped its homoglyph fallback - leaving
116 // Cyrillic/full-width spoofed credentials of the form
117 // `[ɡ̅р][hн]p_<body>` invisible to the scanner.
118 rewritten
119 } else {
120 // Prefix appears in the parse tree but isn't a leading
121 // literal slice and isn't a trivially-rewritable alternation
122 // (e.g. it sits inside a nested group). Skip - there's no
123 // safe text rewrite we can do here.
124 continue;
125 };
126 // Deferred like every other pattern: build the homoglyph
127 // variant's Regex on first use, not here. The old eager
128 // `Regex::new` doubled as a validity gate (skip-if-Err); the
129 // lazy path's never-match fallback covers a non-compiling
130 // variant instead, so a bad expansion simply never fires
131 // rather than being silently dropped at build.
132 fallback.push((
133 CompiledPattern {
134 detector_index,
135 regex: LazyRegex::plain(full_homoglyph_regex),
136 group: pattern.group,
137 client_safe: pattern.client_safe,
138 },
139 Vec::new(),
140 ));
141 }
142
143 if !prefixes.is_empty() {
144 for prefix in prefixes {
145 ac_literals.push(prefix);
146 ac_map.push(compiled.clone());
147 }
148 } else {
149 // Prefix extraction failed - try the AST-walking inner-literal
150 // extractor before falling back. Patterns like
151 // `[a-zA-Z0-9]{20}_AKIA[A-Z0-9]{16}` have no leading literal
152 // but contain `_AKIA` mid-pattern; pulling that into the AC
153 // moves the detector out of the O(m × n) fallback loop and
154 // into the O(n) prefilter path.
155 let inner = extract_inner_literals(&pattern.regex);
156 if !inner.is_empty() {
157 for lit in inner {
158 ac_literals.push(lit);
159 ac_map.push(compiled.clone());
160 }
161 } else {
162 if detector.keywords.is_empty() {
163 quality_warnings.push(format!(
164 "Detector {} pattern {pattern_index} has no literal prefix and no keywords.",
165 detector.id
166 ));
167 }
168 fallback.push((compiled, detector.keywords.clone()));
169 }
170 }
171 }
172 }
173
174 Ok(CompileState {
175 ac_literals,
176 ac_map,
177 fallback,
178 companions,
179 quality_warnings,
180 })
181}
182
183/// If `regex` is `(?:p1|p2|...)body` (with optional inline flags / `?:`
184/// variants), replace the leading alternation group with `expanded_prefix`.
185/// Returns the rewritten regex source; returns `None` if the regex doesn't
186/// start with a non-capturing alternation group we know how to rewrite.
187///
188/// This is the homoglyph counterpart of `extract_literal_prefixes`'s
189/// alternation handling - when the prefix extractor returned a literal
190/// from inside `(?:ghp_|github_pat_)`, the homoglyph compiler needs the
191/// matching surgical rewrite to splice the expanded prefix into the
192/// regex without losing the trailing body constraint.
193pub fn rewrite_alternation_prefix(
194 regex: &str,
195 prefix: &str,
196 expanded_prefix: &str,
197) -> Option<String> {
198 // Strip a leading inline flag group like `(?i)`.
199 let (flag_prefix, body) = split_leading_inline_flag(regex);
200 // Only consider non-capturing groups - `(?:p1|p2|...)`. A bare
201 // `(...)` is a capturing group around the whole credential, NOT an
202 // alternation of prefixes; rewriting it as "{expanded_prefix}{suffix}"
203 // would drop the credential body and leave a regex that matches just
204 // the prefix. That was the flutterwave false-positive on negative:
205 // `(FLWSECK_(?:TEST|LIVE)-[a-f0-9]{32,64}-X)` got rewritten to
206 // `FLW[SСS][EЕΕE]C[KКΚK]_` which then matched bare `FLWSECK_`
207 // anywhere in the text.
208 let group_open_end = if let Some(rest) = body.strip_prefix("(?:") {
209 body.len() - rest.len()
210 } else if let Some(rest) = body.strip_prefix("(?i:") {
211 body.len() - rest.len()
212 } else if let Some(rest) = body.strip_prefix("(?m:") {
213 body.len() - rest.len()
214 } else if let Some(rest) = body.strip_prefix("(?s:") {
215 body.len() - rest.len()
216 } else if let Some(rest) = body.strip_prefix("(?im:") {
217 body.len() - rest.len()
218 } else if let Some(rest) = body.strip_prefix("(?is:") {
219 body.len() - rest.len()
220 } else if let Some(rest) = body.strip_prefix("(?ms:") {
221 body.len() - rest.len()
222 } else {
223 // Bare `(` or no leading group - refuse to rewrite. The simple
224 // strip_prefix path in the caller handles literal-head regexes;
225 // this function is strictly for `(?:...)` alternation prefixes.
226 return None;
227 };
228 // Find the matching closing `)` for the leading group.
229 let bytes = body.as_bytes();
230 let mut depth: i32 = 0;
231 let mut close_at: Option<usize> = None;
232 for (i, &b) in bytes.iter().enumerate() {
233 match b {
234 b'(' => depth += 1,
235 b')' => {
236 depth -= 1;
237 if depth == 0 {
238 close_at = Some(i);
239 break;
240 }
241 }
242 // Don't track escapes - we only need to find the *top-level*
243 // closing paren, and within a regex source a literal `(` or
244 // `)` inside a character class is rare in real detectors.
245 _ => {}
246 }
247 }
248 let close = close_at?;
249 // The leading group must actually contain a `|` - without one this
250 // is just `(?:singleton)pattern`, not an alternation, and rewriting
251 // would silently drop the singleton body.
252 let inside = &body[group_open_end..close];
253 if !inside.contains('|') {
254 return None;
255 }
256 // Trailing body after the alternation group.
257 let suffix = &body[close + 1..];
258 for alt in split_top_level_alternatives(inside) {
259 if let Some(branch_suffix) = alt.strip_prefix(prefix) {
260 return Some(format!(
261 "{flag_prefix}{expanded_prefix}{branch_suffix}{suffix}"
262 ));
263 }
264 }
265 None
266}
267
/// Splits the interior of a regex group on its top-level `|` separators.
///
/// A `|` only counts as a separator when it is not escaped, not inside a
/// `[...]` character class, and not inside a nested parenthesised group.
/// The returned slices borrow from `group` and always contain at least one
/// element (the whole input when no separator is found).
fn split_top_level_alternatives(group: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    let mut piece_start = 0usize;
    let mut paren_depth = 0i32;
    let mut inside_class = false;

    let mut chars = group.char_indices();
    while let Some((pos, c)) = chars.next() {
        // A backslash neutralises whatever character follows it, both
        // inside and outside character classes.
        if c == '\\' {
            chars.next();
            continue;
        }
        // Inside a class the only character of interest is the closing `]`.
        if inside_class {
            if c == ']' {
                inside_class = false;
            }
            continue;
        }
        match c {
            '[' => inside_class = true,
            '(' => paren_depth += 1,
            ')' => paren_depth -= 1,
            '|' if paren_depth == 0 => {
                pieces.push(&group[piece_start..pos]);
                // `|` is a single byte, so the next piece starts right after.
                piece_start = pos + 1;
            }
            _ => {}
        }
    }

    pieces.push(&group[piece_start..]);
    pieces
}
295
/// Splits a leading standalone inline flag group off a regex source.
///
/// When `s` starts with `(?`, followed by zero or more of the flag letters
/// `i`, `m`, `s`, `x`, `u`, `U`, followed directly by `)`, the result is
/// `(flag_group, rest)` - for example `("(?i)", "abc")` for `"(?i)abc"`.
/// In every other case (no such group, a scoped group like `(?i:...)`, a
/// non-capturing group `(?:...)`, or an unterminated group) the result is
/// `("", s)`.
pub fn split_leading_inline_flag(s: &str) -> (&str, &str) {
    let after_open = match s.strip_prefix("(?") {
        Some(rest) => rest,
        None => return ("", s),
    };

    // Number of consecutive flag letters following the `(?` opener.
    let flag_count = after_open
        .bytes()
        .take_while(|&b| matches!(b, b'i' | b'm' | b's' | b'x' | b'u' | b'U'))
        .count();

    // Byte position where the closing `)` has to sit for this to be a
    // standalone flag group.
    let close = 2 + flag_count;
    match s.as_bytes().get(close) {
        Some(b')') => s.split_at(close + 1),
        _ => ("", s),
    }
}