Skip to main content

keyhog_core/spec/
validate.rs

1//! Detector quality gate validation rules used while loading TOML specs.
2
3use super::DetectorSpec;
4use regex_syntax::ast::{self, Ast};
5use serde::Serialize;
6
7const MAX_REGEX_PATTERN_LEN: usize = 4096;
8const MAX_REGEX_AST_NODES: usize = 512;
9const MAX_REGEX_ALTERNATION_BRANCHES: usize = 64;
10const MAX_REGEX_REPEAT_BOUND: u32 = 10_000;
11
12/// Quality issue found in a detector spec.
13///
14/// # Examples
15///
16/// ```rust
17/// use keyhog_core::QualityIssue;
18///
19/// let issue = QualityIssue::Warning("add keywords".into());
20/// assert!(matches!(issue, QualityIssue::Warning(_)));
21/// ```
22#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
23pub enum QualityIssue {
24    Error(String),
25    Warning(String),
26}
27
28/// Validate a detector spec against the quality gate.
29///
30/// # Examples
31///
32/// ```rust
33/// use keyhog_core::{DetectorSpec, PatternSpec, Severity, validate_detector};
34///
35/// let detector = DetectorSpec {
36///     id: "demo".into(),
37///     name: "Demo".into(),
38///     service: "demo".into(),
39///     severity: Severity::High,
40///     patterns: vec![PatternSpec {
41///         regex: "demo_[A-Z0-9]{8}".into(),
42///         description: None,
43///         group: None,
44///     }],
45///     companion: None,
46///     verify: None,
47///     keywords: vec!["demo_".into()],
48/// };
49///
50/// assert!(validate_detector(&detector).is_empty());
51/// ```
52pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
53    let mut issues = Vec::new();
54    validate_patterns_present(spec, &mut issues);
55    validate_regexes(spec, &mut issues);
56    validate_keywords(spec, &mut issues);
57    validate_pattern_specificity(spec, &mut issues);
58    validate_companion(spec, &mut issues);
59    validate_verify_spec(spec, &mut issues);
60    issues
61}
62
63fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
64    if spec.patterns.is_empty() {
65        issues.push(QualityIssue::Error("no patterns defined".into()));
66    }
67}
68
69fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
70    for (i, pat) in spec.patterns.iter().enumerate() {
71        if pat.regex.len() > MAX_REGEX_PATTERN_LEN {
72            issues.push(QualityIssue::Error(format!(
73                "pattern {} regex is too large ({} bytes > {} byte limit)",
74                i,
75                pat.regex.len(),
76                MAX_REGEX_PATTERN_LEN
77            )));
78            continue;
79        }
80
81        match ast::parse::Parser::new().parse(&pat.regex) {
82            Ok(ast) => validate_regex_complexity(i, &ast, issues),
83            Err(_) => issues.push(QualityIssue::Error(format!(
84                "pattern {} regex does not compile: {}",
85                i, pat.regex
86            ))),
87        }
88    }
89}
90
91fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
92    if spec.keywords.is_empty() {
93        issues.push(QualityIssue::Warning(
94            "no keywords defined — pattern may produce false positives".into(),
95        ));
96    }
97}
98
99fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
100    for (i, pat) in spec.patterns.iter().enumerate() {
101        let has_prefix = has_literal_prefix(&pat.regex, 3);
102        let has_group = pat.group.is_some();
103        let is_pure_charclass = is_pure_character_class(&pat.regex);
104
105        if is_pure_charclass && !has_group {
106            issues.push(QualityIssue::Error(format!(
107                "pattern {} is a pure character class ({}) — too broad without context anchoring. \
108                 Use a capture group or add a literal prefix.",
109                i, pat.regex
110            )));
111        } else if !has_prefix && !has_group && spec.keywords.is_empty() {
112            issues.push(QualityIssue::Warning(format!(
113                "pattern {} has no literal prefix and no capture group — may false-positive",
114                i
115            )));
116        }
117    }
118}
119
120fn validate_companion(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
121    if let Some(companion) = &spec.companion
122        && companion.name.trim().is_empty()
123    {
124        issues.push(QualityIssue::Error(
125            "companion name must not be empty".into(),
126        ));
127    }
128}
129
130fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
131    if let Some(ref verify) = spec.verify {
132        if verify.url.is_empty() {
133            issues.push(QualityIssue::Error("verify URL is empty".into()));
134        }
135        if verify.url.starts_with("http://") && !verify.url.contains("localhost") {
136            issues.push(QualityIssue::Warning(
137                "verify URL uses HTTP instead of HTTPS".into(),
138            ));
139        }
140    }
141}
142
143fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
144    let mut count = 0;
145    for ch in pattern.chars() {
146        match ch {
147            '[' | '(' | '.' | '*' | '+' | '?' | '{' | '\\' | '|' | '^' | '$' => break,
148            _ => count += 1,
149        }
150    }
151    count >= min_len
152}
153
154fn is_pure_character_class(pattern: &str) -> bool {
155    let trimmed = pattern.trim();
156    if !trimmed.starts_with('[') {
157        return false;
158    }
159
160    let Some(close) = trimmed.find(']') else {
161        return false;
162    };
163    let remainder = trimmed[close + 1..].trim();
164    if remainder.is_empty() {
165        return true;
166    }
167    if remainder == "+" || remainder == "*" || remainder == "?" {
168        return true;
169    }
170    if remainder.starts_with('{')
171        && let Some(qclose) = remainder.find('}')
172    {
173        let after_quantifier = remainder[qclose + 1..].trim();
174        return after_quantifier.is_empty();
175    }
176
177    false
178}
179
180fn validate_regex_complexity(index: usize, ast: &Ast, issues: &mut Vec<QualityIssue>) {
181    let mut stats = RegexComplexityStats::default();
182    collect_regex_complexity(ast, &mut stats);
183    collect_redos_risks(ast, &mut stats, false);
184
185    if stats.nodes > MAX_REGEX_AST_NODES {
186        issues.push(QualityIssue::Error(format!(
187            "pattern {} regex is too complex ({} AST nodes > {} limit)",
188            index, stats.nodes, MAX_REGEX_AST_NODES
189        )));
190    }
191
192    if stats.max_alternation_branches > MAX_REGEX_ALTERNATION_BRANCHES {
193        issues.push(QualityIssue::Error(format!(
194            "pattern {} regex has too many alternation branches ({} > {} limit)",
195            index, stats.max_alternation_branches, MAX_REGEX_ALTERNATION_BRANCHES
196        )));
197    }
198
199    if stats.max_repeat_bound > MAX_REGEX_REPEAT_BOUND {
200        issues.push(QualityIssue::Error(format!(
201            "pattern {} regex has an excessive counted repetition bound ({} > {} limit)",
202            index, stats.max_repeat_bound, MAX_REGEX_REPEAT_BOUND
203        )));
204    }
205
206    if stats.has_nested_quantifier {
207        issues.push(QualityIssue::Error(format!(
208            "pattern {} regex contains nested quantifiers that can trigger pathological matching",
209            index
210        )));
211    }
212
213    if stats.has_quantified_overlapping_alternation {
214        issues.push(QualityIssue::Error(format!(
215            "pattern {} regex repeats overlapping alternations; use unambiguous branches instead",
216            index
217        )));
218    }
219}
220
221#[derive(Default)]
222struct RegexComplexityStats {
223    nodes: usize,
224    max_alternation_branches: usize,
225    max_repeat_bound: u32,
226    has_nested_quantifier: bool,
227    has_quantified_overlapping_alternation: bool,
228}
229
230fn collect_regex_complexity(ast: &Ast, stats: &mut RegexComplexityStats) {
231    stats.nodes += 1;
232    match ast {
233        Ast::Repetition(repetition) => {
234            update_repeat_bound(&repetition.op.kind, stats);
235            collect_regex_complexity(&repetition.ast, stats);
236        }
237        Ast::Group(group) => collect_regex_complexity(&group.ast, stats),
238        Ast::Alternation(alternation) => {
239            stats.max_alternation_branches =
240                stats.max_alternation_branches.max(alternation.asts.len());
241            for ast in &alternation.asts {
242                collect_regex_complexity(ast, stats);
243            }
244        }
245        Ast::Concat(concat) => {
246            for ast in &concat.asts {
247                collect_regex_complexity(ast, stats);
248            }
249        }
250        Ast::Empty(_)
251        | Ast::Flags(_)
252        | Ast::Literal(_)
253        | Ast::Dot(_)
254        | Ast::Assertion(_)
255        | Ast::ClassUnicode(_)
256        | Ast::ClassPerl(_)
257        | Ast::ClassBracketed(_) => {}
258    }
259}
260
261fn collect_redos_risks(ast: &Ast, stats: &mut RegexComplexityStats, inside_repetition: bool) {
262    match ast {
263        Ast::Repetition(repetition) => {
264            // Flag nested quantifiers only when they can cause exponential backtracking.
265            //
266            // SAFE patterns (char class quantifier inside group quantifier):
267            //   (?:api[_\s.-]*)? — [_\s.-]* is atomic, can't overlap
268            //   (?:key|token)[=:\s"']+  — char class quantifier, deterministic
269            //
270            // DANGEROUS patterns (group/concat quantifier inside quantifier):
271            //   (a+)+       — classic ReDoS
272            //   (\w+\s*)+   — overlapping quantifiers on non-atomic elements
273            //
274            // Strategy: only flag when THIS repetition wraps a non-atomic element
275            // AND we're inside another repetition, OR when our inner AST itself
276            // contains a nested repetition wrapping a non-atomic element.
277            let this_is_simple_atom = matches!(
278                &*repetition.ast,
279                Ast::Literal(_)
280                    | Ast::Dot(_)
281                    | Ast::ClassBracketed(_)
282                    | Ast::ClassPerl(_)
283                    | Ast::ClassUnicode(_)
284            );
285            let this_is_unbounded = matches!(
286                repetition.op.kind,
287                ast::RepetitionKind::ZeroOrMore
288                    | ast::RepetitionKind::OneOrMore
289                    | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast { .. })
290            );
291            // Only flag when BOTH the outer and this repetition are unbounded
292            // and this wraps a non-atomic element. (?:group)? is safe because
293            // ? is {0,1} — it can't cause exponential backtracking.
294            if inside_repetition && !this_is_simple_atom && this_is_unbounded {
295                stats.has_nested_quantifier = true;
296            }
297            if !inside_repetition
298                && this_is_unbounded
299                && !this_is_simple_atom
300                && ast_contains_repetition(&repetition.ast)
301            {
302                stats.has_nested_quantifier = true;
303            }
304            if alternation_has_overlapping_prefixes(&repetition.ast) {
305                stats.has_quantified_overlapping_alternation = true;
306            }
307            // Only propagate inside_repetition when this is unbounded
308            collect_redos_risks(
309                &repetition.ast,
310                stats,
311                inside_repetition || this_is_unbounded,
312            );
313        }
314        Ast::Group(group) => collect_redos_risks(&group.ast, stats, inside_repetition),
315        Ast::Alternation(alternation) => {
316            for ast in &alternation.asts {
317                collect_redos_risks(ast, stats, inside_repetition);
318            }
319        }
320        Ast::Concat(concat) => {
321            for ast in &concat.asts {
322                collect_redos_risks(ast, stats, inside_repetition);
323            }
324        }
325        Ast::Empty(_)
326        | Ast::Flags(_)
327        | Ast::Literal(_)
328        | Ast::Dot(_)
329        | Ast::Assertion(_)
330        | Ast::ClassUnicode(_)
331        | Ast::ClassPerl(_)
332        | Ast::ClassBracketed(_) => {}
333    }
334}
335
336fn ast_contains_repetition(ast: &Ast) -> bool {
337    match ast {
338        Ast::Repetition(_) => true,
339        Ast::Group(group) => ast_contains_repetition(&group.ast),
340        Ast::Alternation(alternation) => alternation.asts.iter().any(ast_contains_repetition),
341        Ast::Concat(concat) => concat.asts.iter().any(ast_contains_repetition),
342        Ast::Empty(_)
343        | Ast::Flags(_)
344        | Ast::Literal(_)
345        | Ast::Dot(_)
346        | Ast::Assertion(_)
347        | Ast::ClassUnicode(_)
348        | Ast::ClassPerl(_)
349        | Ast::ClassBracketed(_) => false,
350    }
351}
352
353fn alternation_has_overlapping_prefixes(ast: &Ast) -> bool {
354    let alternatives = match ast {
355        Ast::Alternation(alternation) => &alternation.asts,
356        Ast::Group(group) => return alternation_has_overlapping_prefixes(&group.ast),
357        _ => return false,
358    };
359
360    let prefixes = alternatives
361        .iter()
362        .filter_map(literalish_prefix)
363        .collect::<Vec<_>>();
364    for (idx, prefix) in prefixes.iter().enumerate() {
365        for other in prefixes.iter().skip(idx + 1) {
366            if prefix.starts_with(other) || other.starts_with(prefix) {
367                return true;
368            }
369        }
370    }
371    false
372}
373
374fn literalish_prefix(ast: &Ast) -> Option<String> {
375    match ast {
376        Ast::Literal(literal) => Some(literal.c.to_string()),
377        Ast::Concat(concat) => {
378            let mut prefix = String::new();
379            for node in &concat.asts {
380                match node {
381                    Ast::Literal(literal) => prefix.push(literal.c),
382                    Ast::Group(group) => prefix.push_str(&literalish_prefix(&group.ast)?),
383                    _ => break,
384                }
385            }
386            (!prefix.is_empty()).then_some(prefix)
387        }
388        Ast::Group(group) => literalish_prefix(&group.ast),
389        _ => None,
390    }
391}
392
393fn update_repeat_bound(kind: &ast::RepetitionKind, stats: &mut RegexComplexityStats) {
394    let bound = match kind {
395        ast::RepetitionKind::ZeroOrOne => 1,
396        ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => MAX_REGEX_REPEAT_BOUND,
397        ast::RepetitionKind::Range(range) => match range {
398            ast::RepetitionRange::Exactly(max)
399            | ast::RepetitionRange::AtLeast(max)
400            | ast::RepetitionRange::Bounded(_, max) => *max,
401        },
402    };
403    stats.max_repeat_bound = stats.max_repeat_bound.max(bound);
404}
405
406#[cfg(test)]
407mod tests {
408    use super::*;
409    use crate::Severity;
410
411    fn detector_with_pattern(regex: &str) -> DetectorSpec {
412        DetectorSpec {
413            id: "test-detector".into(),
414            name: "Test Detector".into(),
415            service: "test".into(),
416            severity: Severity::High,
417            keywords: vec!["token".into()],
418            patterns: vec![crate::PatternSpec {
419                regex: regex.into(),
420                description: None,
421                group: None,
422            }],
423            verify: None,
424            companion: None,
425        }
426    }
427
428    #[test]
429    fn rejects_excessive_alternation_fanout() {
430        let regex = (0..65)
431            .map(|i| format!("opt{i}"))
432            .collect::<Vec<_>>()
433            .join("|");
434        let issues = validate_detector(&detector_with_pattern(&regex));
435
436        assert!(issues.iter().any(|issue| matches!(
437            issue,
438            QualityIssue::Error(message) if message.contains("alternation branches")
439        )));
440    }
441
442    #[test]
443    fn rejects_excessive_counted_repetition() {
444        let issues = validate_detector(&detector_with_pattern("token[a-z]{10001}"));
445
446        assert!(issues.iter().any(|issue| matches!(
447            issue,
448            QualityIssue::Error(message) if message.contains("counted repetition bound")
449        )));
450    }
451
452    #[test]
453    fn rejects_nested_quantifiers() {
454        let issues = validate_detector(&detector_with_pattern("(a+)+b"));
455
456        assert!(issues.iter().any(|issue| matches!(
457            issue,
458            QualityIssue::Error(message) if message.contains("nested quantifiers")
459        )));
460    }
461
462    #[test]
463    fn rejects_quantified_overlapping_alternation() {
464        let issues = validate_detector(&detector_with_pattern("(ab|a)+z"));
465
466        assert!(issues.iter().any(|issue| matches!(
467            issue,
468            QualityIssue::Error(message) if message.contains("overlapping alternations")
469        )));
470    }
471}