Skip to main content

keyhog_scanner/
compiler_build.rs

1//! Logic for compiling detector specifications into an efficient scanning engine.
2
3use crate::error::Result;
4use crate::types::*;
5use keyhog_core::DetectorSpec;
6
7use super::compiler_prefix::{extract_inner_literals, extract_literal_prefixes};
8
9use super::compiler_compile::{compile_detector_companions, compile_pattern};
10
11pub struct CompileState {
12    pub ac_literals: Vec<String>,
13    pub ac_map: Vec<CompiledPattern>,
14    pub fallback: Vec<(CompiledPattern, Vec<String>)>,
15    pub companions: Vec<Vec<CompiledCompanion>>,
16    pub quality_warnings: Vec<String>,
17}
18
19pub fn build_compile_state(detectors: &[DetectorSpec]) -> Result<CompileState> {
20    use rayon::prelude::*;
21
22    // De-duplicate identical regex strings BEFORE compilation. The 888-
23    // detector corpus has ~6-15% duplicate patterns (e.g. multiple
24    // google-* detectors share the `AIza` regex shape). Compiling each
25    // once cuts startup-compile time and RAM proportionally - see
26    // audits/legendary-2026-04-26.
27    //
28    // The count is informational only (one debug log line), so gate the
29    // whole computation behind the DEBUG level check and borrow the regex
30    // sources instead of cloning them. Under any non-debug level this is
31    // zero allocation - it used to heap-clone ~1000+ regex source strings
32    // into an owned HashMap on every scanner construction (every CLI
33    // invocation, every daemon/watch recompile) solely to print the count.
34    if tracing::enabled!(tracing::Level::DEBUG) {
35        let unique = detectors
36            .iter()
37            .flat_map(|d| d.patterns.iter().map(|p| p.regex.as_str()))
38            .collect::<std::collections::HashSet<&str>>()
39            .len();
40        tracing::debug!(unique, "compiler dedup: unique pattern regexes");
41    }
42
43    // Phase 1: Pre-compile all regexes in parallel (the expensive part).
44    let compiled_results: Vec<Result<(Vec<CompiledPattern>, Vec<CompiledCompanion>)>> = detectors
45        .par_iter()
46        .enumerate()
47        .map(|(detector_index, detector)| {
48            let companions = compile_detector_companions(detector)?;
49            let mut patterns = Vec::new();
50            for (pattern_index, pattern) in detector.patterns.iter().enumerate() {
51                patterns.push(compile_pattern(
52                    detector_index,
53                    pattern_index,
54                    pattern,
55                    &detector.id,
56                )?);
57            }
58            Ok((patterns, companions))
59        })
60        .collect();
61
62    // Phase 2: Assemble results sequentially (fast, no regex compilation).
63    let mut ac_literals = Vec::new();
64    let mut ac_map = Vec::new();
65    let mut fallback = Vec::new();
66    let mut companions = Vec::with_capacity(detectors.len());
67    let mut quality_warnings = Vec::new();
68
69    for (detector_index, (result, detector)) in compiled_results
70        .into_iter()
71        .zip(detectors.iter())
72        .enumerate()
73    {
74        let (compiled_patterns, detector_companions) = result?;
75        companions.push(detector_companions);
76
77        for (pattern_index, (compiled, pattern)) in compiled_patterns
78            .into_iter()
79            .zip(detector.patterns.iter())
80            .enumerate()
81        {
82            let prefixes = extract_literal_prefixes(&pattern.regex);
83
84            // Homoglyph expansion for high-confidence patterns: catches
85            // tokens where the literal prefix has been visually spoofed
86            // with Cyrillic/Greek/full-width lookalikes. Earlier code
87            // dropped just the expanded PREFIX into fallback as
88            // `Regex::new("^[hh][ff]_")` - anchored to start, but with
89            // NO body constraint, so any string beginning with the
90            // prefix would match. Combined with the task #69 fallback
91            // wire fix that finally runs these patterns, that turned
92            // every prefix-anchored detector into "fires on `<prefix>*`."
93            // Fix: substitute the expanded prefix into the FULL regex so
94            // the homoglyph variant still requires the rest of the
95            // pattern to match.
96            for prefix in &prefixes {
97                if prefix.len() < 3 {
98                    continue;
99                }
100                let expanded_prefix = crate::homoglyph::expand_homoglyphs(prefix);
101                if expanded_prefix == *prefix {
102                    continue;
103                }
104                let full_homoglyph_regex =
105                    if let Some(suffix) = pattern.regex.strip_prefix(prefix.as_str()) {
106                        // Simple case: prefix is the literal head of the regex.
107                        format!("{expanded_prefix}{suffix}")
108                    } else if let Some(rewritten) =
109                        rewrite_alternation_prefix(&pattern.regex, prefix, &expanded_prefix)
110                    {
111                        // Alternation case: regex is `(?:p1|p2|...)body`. Replace
112                        // the leading `(?:...)` with the expanded prefix so the
113                        // homoglyph variant still requires the rest of the pattern
114                        // to match. Without this, every alternation-prefix detector
115                        // silently skipped its homoglyph fallback - leaving
116                        // Cyrillic/full-width spoofed credentials of the form
117                        // `[ɡ̅р][hн]p_<body>` invisible to the scanner.
118                        rewritten
119                    } else {
120                        // Prefix appears in the parse tree but isn't a leading
121                        // literal slice and isn't a trivially-rewritable alternation
122                        // (e.g. it sits inside a nested group). Skip - there's no
123                        // safe text rewrite we can do here.
124                        continue;
125                    };
126                // Deferred like every other pattern: build the homoglyph
127                // variant's Regex on first use, not here. The old eager
128                // `Regex::new` doubled as a validity gate (skip-if-Err); the
129                // lazy path's never-match fallback covers a non-compiling
130                // variant instead, so a bad expansion simply never fires
131                // rather than being silently dropped at build.
132                fallback.push((
133                    CompiledPattern {
134                        detector_index,
135                        regex: LazyRegex::plain(full_homoglyph_regex),
136                        group: pattern.group,
137                        client_safe: pattern.client_safe,
138                    },
139                    Vec::new(),
140                ));
141            }
142
143            if !prefixes.is_empty() {
144                for prefix in prefixes {
145                    ac_literals.push(prefix);
146                    ac_map.push(compiled.clone());
147                }
148            } else {
149                // Prefix extraction failed - try the AST-walking inner-literal
150                // extractor before falling back. Patterns like
151                // `[a-zA-Z0-9]{20}_AKIA[A-Z0-9]{16}` have no leading literal
152                // but contain `_AKIA` mid-pattern; pulling that into the AC
153                // moves the detector out of the O(m × n) fallback loop and
154                // into the O(n) prefilter path.
155                let inner = extract_inner_literals(&pattern.regex);
156                if !inner.is_empty() {
157                    for lit in inner {
158                        ac_literals.push(lit);
159                        ac_map.push(compiled.clone());
160                    }
161                } else {
162                    if detector.keywords.is_empty() {
163                        quality_warnings.push(format!(
164                            "Detector {} pattern {pattern_index} has no literal prefix and no keywords.",
165                            detector.id
166                        ));
167                    }
168                    fallback.push((compiled, detector.keywords.clone()));
169                }
170            }
171        }
172    }
173
174    Ok(CompileState {
175        ac_literals,
176        ac_map,
177        fallback,
178        companions,
179        quality_warnings,
180    })
181}
182
183/// If `regex` is `(?:p1|p2|...)body` (with optional inline flags / `?:`
184/// variants), replace the leading alternation group with `expanded_prefix`.
185/// Returns the rewritten regex source; returns `None` if the regex doesn't
186/// start with a non-capturing alternation group we know how to rewrite.
187///
188/// This is the homoglyph counterpart of `extract_literal_prefixes`'s
189/// alternation handling - when the prefix extractor returned a literal
190/// from inside `(?:ghp_|github_pat_)`, the homoglyph compiler needs the
191/// matching surgical rewrite to splice the expanded prefix into the
192/// regex without losing the trailing body constraint.
193pub fn rewrite_alternation_prefix(
194    regex: &str,
195    prefix: &str,
196    expanded_prefix: &str,
197) -> Option<String> {
198    // Strip a leading inline flag group like `(?i)`.
199    let (flag_prefix, body) = split_leading_inline_flag(regex);
200    // Only consider non-capturing groups - `(?:p1|p2|...)`. A bare
201    // `(...)` is a capturing group around the whole credential, NOT an
202    // alternation of prefixes; rewriting it as "{expanded_prefix}{suffix}"
203    // would drop the credential body and leave a regex that matches just
204    // the prefix. That was the flutterwave false-positive on negative:
205    // `(FLWSECK_(?:TEST|LIVE)-[a-f0-9]{32,64}-X)` got rewritten to
206    // `FLW[SСS][EЕΕE]C[KКΚK]_` which then matched bare `FLWSECK_`
207    // anywhere in the text.
208    let group_open_end = if let Some(rest) = body.strip_prefix("(?:") {
209        body.len() - rest.len()
210    } else if let Some(rest) = body.strip_prefix("(?i:") {
211        body.len() - rest.len()
212    } else if let Some(rest) = body.strip_prefix("(?m:") {
213        body.len() - rest.len()
214    } else if let Some(rest) = body.strip_prefix("(?s:") {
215        body.len() - rest.len()
216    } else if let Some(rest) = body.strip_prefix("(?im:") {
217        body.len() - rest.len()
218    } else if let Some(rest) = body.strip_prefix("(?is:") {
219        body.len() - rest.len()
220    } else if let Some(rest) = body.strip_prefix("(?ms:") {
221        body.len() - rest.len()
222    } else {
223        // Bare `(` or no leading group - refuse to rewrite. The simple
224        // strip_prefix path in the caller handles literal-head regexes;
225        // this function is strictly for `(?:...)` alternation prefixes.
226        return None;
227    };
228    // Find the matching closing `)` for the leading group.
229    let bytes = body.as_bytes();
230    let mut depth: i32 = 0;
231    let mut close_at: Option<usize> = None;
232    for (i, &b) in bytes.iter().enumerate() {
233        match b {
234            b'(' => depth += 1,
235            b')' => {
236                depth -= 1;
237                if depth == 0 {
238                    close_at = Some(i);
239                    break;
240                }
241            }
242            // Don't track escapes - we only need to find the *top-level*
243            // closing paren, and within a regex source a literal `(` or
244            // `)` inside a character class is rare in real detectors.
245            _ => {}
246        }
247    }
248    let close = close_at?;
249    // The leading group must actually contain a `|` - without one this
250    // is just `(?:singleton)pattern`, not an alternation, and rewriting
251    // would silently drop the singleton body.
252    let inside = &body[group_open_end..close];
253    if !inside.contains('|') {
254        return None;
255    }
256    // Trailing body after the alternation group.
257    let suffix = &body[close + 1..];
258    for alt in split_top_level_alternatives(inside) {
259        if let Some(branch_suffix) = alt.strip_prefix(prefix) {
260            return Some(format!(
261                "{flag_prefix}{expanded_prefix}{branch_suffix}{suffix}"
262            ));
263        }
264    }
265    None
266}
267
/// Splits the inside of a regex group on its top-level `|` separators.
///
/// `group` is the text between a group's opener and its closing `)`. A `|`
/// is treated as a separator only when it is not escaped, not inside a
/// nested `(...)` group and not inside a character class. Character classes
/// are tracked by depth, so nested and POSIX classes such as `[[:alpha:]|]`
/// stay in one piece, and a `]` in first position (`[]|]`, `[^]|]`) is taken
/// as a literal member of the class rather than its end.
///
/// Returns the alternatives in source order as slices of `group`. There is
/// always at least one element; an empty `group` yields a single empty
/// string.
fn split_top_level_alternatives(group: &str) -> Vec<&str> {
    let mut alts = Vec::new();
    let mut start = 0;
    let mut group_depth = 0i32;
    let mut class_depth = 0u32;
    let mut chars = group.char_indices().peekable();
    while let Some((idx, ch)) = chars.next() {
        match ch {
            // Skip whatever follows a backslash.
            '\\' => {
                chars.next();
            }
            '[' => {
                class_depth += 1;
                // A negating `^` and a `]` in first position belong to the
                // class itself; neither closes it.
                chars.next_if(|&(_, next)| next == '^');
                chars.next_if(|&(_, next)| next == ']');
            }
            ']' if class_depth > 0 => class_depth -= 1,
            '(' if class_depth == 0 => group_depth += 1,
            ')' if class_depth == 0 => group_depth -= 1,
            '|' if group_depth == 0 && class_depth == 0 => {
                alts.push(&group[start..idx]);
                start = idx + ch.len_utf8();
            }
            _ => {}
        }
    }
    alts.push(&group[start..]);
    alts
}
295
/// Separates a leading inline flag group such as `(?i)` or `(?ms)` from the
/// rest of a regex source.
///
/// A flag group is recognised when `s` starts with `(?`, followed by any
/// number of the letters `i`, `m`, `s`, `x`, `u`, `U`, followed directly by
/// `)`. Returns `(flag_group, remainder)`; when `s` does not begin with such
/// a group the result is `("", s)`. Scoped groups like `(?i:...)` and
/// negated flags like `(?-i)` are not flag groups for this purpose.
pub fn split_leading_inline_flag(s: &str) -> (&str, &str) {
    const OPEN: &str = "(?";

    let after_open = match s.strip_prefix(OPEN) {
        Some(rest) => rest,
        None => return ("", s),
    };
    let flag_count = after_open
        .bytes()
        .take_while(|b| matches!(b, b'i' | b'm' | b's' | b'x' | b'u' | b'U'))
        .count();
    match after_open.as_bytes().get(flag_count) {
        // Cut just past the `)`; everything before it is ASCII, so the cut
        // always lands on a character boundary.
        Some(b')') => s.split_at(OPEN.len() + flag_count + 1),
        _ => ("", s),
    }
}