Skip to main content

keyhog_core/spec/
validate.rs

1//! Detector quality gate validation rules used while loading TOML specs.
2
3use super::DetectorSpec;
4use regex_syntax::ast::{self, Ast};
5use serde::Serialize;
6
/// Longest regex source, in bytes, that the gate will hand to the parser.
const MAX_REGEX_PATTERN_LEN: usize = 4_096;
/// Largest number of AST nodes counted for one regex before it is rejected.
const MAX_REGEX_AST_NODES: usize = 512;
/// Largest number of branches allowed in any single alternation.
const MAX_REGEX_ALTERNATION_BRANCHES: usize = 64;
/// Largest bound allowed in a counted repetition such as `{n}` or `{m,n}`.
const MAX_REGEX_REPEAT_BOUND: u32 = 1_000;
11
12/// Quality issue found in a detector spec.
13///
14/// # Examples
15///
16/// ```rust
17/// use keyhog_core::QualityIssue;
18///
19/// let issue = QualityIssue::Warning("add keywords".into());
20/// assert!(matches!(issue, QualityIssue::Warning(_)));
21/// ```
22#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
23pub enum QualityIssue {
24    Error(String),
25    Warning(String),
26}
27
28/// Validate a detector spec against the quality gate.
29///
30/// # Examples
31///
32/// ```rust
33/// use keyhog_core::{DetectorSpec, PatternSpec, Severity, validate_detector};
34///
35/// let detector = DetectorSpec {
36///     id: "demo".into(),
37///     name: "Demo".into(),
38///     service: "demo".into(),
39///     severity: Severity::High,
40///     patterns: vec![PatternSpec {
41///         regex: "demo_[A-Z0-9]{8}".into(),
42///         description: None,
43///         group: None,
44///     }],
45///     companions: Vec::new(),
46///     verify: None,
47///     keywords: vec!["demo_".into()],
48/// };
49///
50/// assert!(validate_detector(&detector).is_empty());
51/// ```
52pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
53    let mut issues = Vec::new();
54    validate_patterns_present(spec, &mut issues);
55    validate_regexes(spec, &mut issues);
56    validate_keywords(spec, &mut issues);
57    validate_pattern_specificity(spec, &mut issues);
58    validate_companions(spec, &mut issues);
59    validate_verify_spec(spec, &mut issues);
60    issues
61}
62
63fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
64    if spec.patterns.is_empty() {
65        issues.push(QualityIssue::Error("no patterns defined".into()));
66    }
67}
68
69fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
70    for (i, pat) in spec.patterns.iter().enumerate() {
71        validate_regex_definition("pattern", i, &pat.regex, issues);
72    }
73}
74
75fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
76    if spec.keywords.is_empty() {
77        issues.push(QualityIssue::Warning(
78            "no keywords defined — pattern may produce false positives".into(),
79        ));
80    }
81}
82
83fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
84    for (i, pat) in spec.patterns.iter().enumerate() {
85        let has_prefix = has_literal_prefix(&pat.regex, 3);
86        let has_group = pat.group.is_some();
87        let is_pure_charclass = is_pure_character_class(&pat.regex);
88
89        if is_pure_charclass && !has_group {
90            issues.push(QualityIssue::Error(format!(
91                "pattern {} is a pure character class ({}) — too broad without context anchoring. \
92                 Use a capture group or add a literal prefix.",
93                i, pat.regex
94            )));
95        } else if !has_prefix && !has_group && spec.keywords.is_empty() {
96            issues.push(QualityIssue::Warning(format!(
97                "pattern {} has no literal prefix and no capture group — may false-positive",
98                i
99            )));
100        }
101    }
102}
103
104fn validate_companions(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
105    for (i, companion) in spec.companions.iter().enumerate() {
106        if companion.name.trim().is_empty() {
107            issues.push(QualityIssue::Error(format!(
108                "companion {} name must not be empty",
109                i
110            )));
111        }
112        validate_regex_definition("companion", i, &companion.regex, issues);
113        if is_pure_character_class(&companion.regex) {
114            issues.push(QualityIssue::Error(format!(
115                "companion {} regex '{}' is a pure character class — add a literal context anchor",
116                i, companion.regex
117            )));
118        } else if !has_substantial_literal(&companion.regex, 3) {
119            issues.push(QualityIssue::Warning(format!(
120                "companion {} regex '{}' is too broad — may produce false positives. \
121                 Add a context anchor like 'KEY_NAME='.",
122                i, companion.regex
123            )));
124        }
125    }
126}
127
128fn validate_regex_definition(
129    kind: &str,
130    index: usize,
131    regex: &str,
132    issues: &mut Vec<QualityIssue>,
133) {
134    if regex.len() > MAX_REGEX_PATTERN_LEN {
135        issues.push(QualityIssue::Error(format!(
136            "{kind} {index} regex is too large ({} bytes > {} byte limit)",
137            regex.len(),
138            MAX_REGEX_PATTERN_LEN
139        )));
140        return;
141    }
142
143    match ast::parse::Parser::new().parse(regex) {
144        Ok(ast) => validate_regex_complexity(kind, index, &ast, issues),
145        Err(error) => issues.push(QualityIssue::Error(format!(
146            "{kind} {index} regex does not compile: {error}"
147        ))),
148    }
149}
150
/// Report whether `pattern` contains a run of at least `min_len` consecutive
/// literal characters.
///
/// This is a textual scan of the regex source, not a parse. A run is made of
/// plain characters and escaped metacharacters (see [`is_escaped_literal`]).
/// It is ended by any unescaped metacharacter, by an escape that is not a
/// literal (such as `\d` or `\w`), by a bracket class, and by a counted
/// repetition.
///
/// Two kinds of text never count towards a run:
/// * anything inside a bracket class `[...]`, including escaped members such
///   as `\.` — a class matches one of several characters, not fixed text;
/// * the bounds inside a counted repetition `{m,n}` — the digits and comma
///   are quantifier syntax, not text the regex matches.
///
/// `min_len == 0` is always satisfied.
fn has_substantial_literal(pattern: &str, min_len: usize) -> bool {
    let mut longest = 0usize;
    let mut run = 0usize;
    let mut escaped = false;
    let mut in_class = false;
    let mut in_bounds = false;

    for ch in pattern.chars() {
        if escaped {
            escaped = false;
            if !in_class && !in_bounds && is_escaped_literal(ch) {
                run += 1;
            } else {
                // Escapes like `\d`, and any escape inside a class, end the run.
                longest = longest.max(run);
                run = 0;
            }
            continue;
        }

        match ch {
            '\\' => escaped = true,
            '[' => {
                longest = longest.max(run);
                run = 0;
                in_class = true;
            }
            ']' => in_class = false,
            // Class members are alternatives, never fixed text.
            _ if in_class => {}
            '{' => {
                longest = longest.max(run);
                run = 0;
                in_bounds = true;
            }
            '}' => {
                longest = longest.max(run);
                run = 0;
                in_bounds = false;
            }
            // Digits and commas of `{m,n}` are quantifier syntax.
            _ if in_bounds => {}
            '(' | ')' | '.' | '*' | '+' | '?' | '|' | '^' | '$' => {
                longest = longest.max(run);
                run = 0;
            }
            _ => run += 1,
        }
    }

    longest.max(run) >= min_len
}

/// Report whether `ch`, when written after a backslash, denotes that literal
/// character rather than a class or assertion.
///
/// The set is the regex metacharacters, including `#`, `&`, `-` and `~`,
/// which regex-syntax also treats as meta characters that may be escaped.
fn is_escaped_literal(ch: char) -> bool {
    matches!(
        ch,
        '[' | ']'
            | '('
            | ')'
            | '.'
            | '*'
            | '+'
            | '?'
            | '{'
            | '}'
            | '\\'
            | '|'
            | '^'
            | '$'
            | '#'
            | '&'
            | '-'
            | '~'
    )
}
200
201fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
202    if let Some(ref verify) = spec.verify {
203        // verify.service defaults to the detector's service — empty is fine
204        if !verify.steps.is_empty() {
205            for step in &verify.steps {
206                validate_url(&step.url, issues);
207            }
208        } else if let Some(ref url) = verify.url {
209            validate_url(url, issues);
210        } else {
211            issues.push(QualityIssue::Error(
212                "verify spec has no steps and no default URL".into(),
213            ));
214        }
215    }
216}
217
218fn validate_url(url: &str, issues: &mut Vec<QualityIssue>) {
219    if url.is_empty() {
220        issues.push(QualityIssue::Error("verify URL is empty".into()));
221    }
222    if url.starts_with("http://") && !url.contains("localhost") {
223        issues.push(QualityIssue::Warning(
224            "verify URL uses HTTP instead of HTTPS".into(),
225        ));
226    }
227}
228
/// Report whether `pattern` begins with at least `min_len` characters that
/// come before its first regex metacharacter.
///
/// The scan stops at the first of `[ ( . * + ? { \ | ^ $`; every character
/// before that point counts, so escapes and anchors at the very start yield a
/// prefix length of zero.
fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
    const STOPPERS: [char; 11] = ['[', '(', '.', '*', '+', '?', '{', '\\', '|', '^', '$'];

    let prefix_len = pattern
        .chars()
        .take_while(|ch| !STOPPERS.contains(ch))
        .count();
    prefix_len >= min_len
}
239
/// Report whether `pattern` is nothing but one bracket class, optionally
/// followed by a single quantifier.
///
/// Accepted shapes (after trimming surrounding whitespace): `[...]`,
/// `[...]+`, `[...]*`, `[...]?`, `[...]{...}`, and the lazy form of each
/// quantifier (`+?`, `{8}?`, ...).
///
/// The end of the class is the bracket that matches the opening one, so
/// escaped brackets (`[\]a-z]`), a leading literal `]` (`[]a]`, `[^]a]`) and
/// nested classes (`[[:alnum:]]`, `[a-z&&[^aeiou]]`) are all recognised as a
/// single class.
fn is_pure_character_class(pattern: &str) -> bool {
    let trimmed = pattern.trim();
    let Some(close) = bracket_class_end(trimmed) else {
        return false;
    };

    let rest = trimmed[close + 1..].trim();
    let after_quantifier = if let Some(tail) = rest.strip_prefix(['+', '*', '?']) {
        tail
    } else if rest.starts_with('{') {
        match rest.find('}') {
            Some(end) => &rest[end + 1..],
            None => return false,
        }
    } else {
        rest
    };

    // A single trailing `?` makes the quantifier lazy; it adds no context.
    let after_lazy = after_quantifier
        .strip_prefix('?')
        .unwrap_or(after_quantifier);
    after_lazy.trim().is_empty()
}

/// Return the byte index of the `]` that closes the bracket class opening at
/// the start of `pattern`, or `None` when `pattern` does not start with `[`
/// or the class is never closed.
fn bracket_class_end(pattern: &str) -> Option<usize> {
    let mut chars = pattern.char_indices();
    if !matches!(chars.next(), Some((_, '['))) {
        return None;
    }

    let mut depth = 1usize;
    let mut escaped = false;
    // True directly after an opening `[` (and its optional `^`), where a `]`
    // is a literal member rather than the end of the class.
    let mut at_start = true;
    let mut caret_allowed = true;

    for (idx, ch) in chars {
        if escaped {
            escaped = false;
            at_start = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '^' if at_start && caret_allowed => caret_allowed = false,
            '[' => {
                depth += 1;
                at_start = true;
                caret_allowed = true;
            }
            ']' if !at_start => {
                depth -= 1;
                if depth == 0 {
                    return Some(idx);
                }
            }
            _ => at_start = false,
        }
    }
    None
}
265
266fn validate_regex_complexity(kind: &str, index: usize, ast: &Ast, issues: &mut Vec<QualityIssue>) {
267    let mut stats = RegexComplexityStats::default();
268    collect_regex_complexity(ast, &mut stats);
269    collect_redos_risks(ast, &mut stats, false);
270
271    if stats.nodes > MAX_REGEX_AST_NODES {
272        issues.push(QualityIssue::Error(format!(
273            "{kind} {index} regex is too complex ({} AST nodes > {} limit)",
274            stats.nodes, MAX_REGEX_AST_NODES
275        )));
276    }
277
278    if stats.max_alternation_branches > MAX_REGEX_ALTERNATION_BRANCHES {
279        issues.push(QualityIssue::Error(format!(
280            "{kind} {index} regex has too many alternation branches ({} > {} limit)",
281            stats.max_alternation_branches, MAX_REGEX_ALTERNATION_BRANCHES
282        )));
283    }
284
285    if stats.max_repeat_bound > MAX_REGEX_REPEAT_BOUND {
286        issues.push(QualityIssue::Error(format!(
287            "{kind} {index} regex has an excessive counted repetition bound ({} > {} limit)",
288            stats.max_repeat_bound, MAX_REGEX_REPEAT_BOUND
289        )));
290    }
291
292    if stats.has_nested_quantifier {
293        issues.push(QualityIssue::Error(format!(
294            "{kind} {index} regex contains nested quantifiers that can trigger pathological matching"
295        )));
296    }
297
298    if stats.has_quantified_overlapping_alternation {
299        issues.push(QualityIssue::Error(format!(
300            "{kind} {index} regex repeats overlapping alternations; use unambiguous branches instead"
301        )));
302    }
303}
304
/// Measurements accumulated while walking one regex AST.
#[derive(Default)]
struct RegexComplexityStats {
    /// Number of AST nodes visited by the complexity walk.
    nodes: usize,
    /// Branch count of the widest alternation seen.
    max_alternation_branches: usize,
    /// Largest repetition bound seen.
    max_repeat_bound: u32,
    /// Set when the ReDoS walk finds a risky quantifier nested in another.
    has_nested_quantifier: bool,
    /// Set when a repetition wraps an alternation whose branches share a
    /// literal prefix.
    has_quantified_overlapping_alternation: bool,
}
313
314fn collect_regex_complexity(ast: &Ast, stats: &mut RegexComplexityStats) {
315    stats.nodes += 1;
316    match ast {
317        Ast::Repetition(repetition) => {
318            update_repeat_bound(&repetition.op.kind, stats);
319            collect_regex_complexity(&repetition.ast, stats);
320        }
321        Ast::Group(group) => collect_regex_complexity(&group.ast, stats),
322        Ast::Alternation(alternation) => {
323            stats.max_alternation_branches =
324                stats.max_alternation_branches.max(alternation.asts.len());
325            for ast in &alternation.asts {
326                collect_regex_complexity(ast, stats);
327            }
328        }
329        Ast::Concat(concat) => {
330            for ast in &concat.asts {
331                collect_regex_complexity(ast, stats);
332            }
333        }
334        Ast::Empty(_)
335        | Ast::Flags(_)
336        | Ast::Literal(_)
337        | Ast::Dot(_)
338        | Ast::Assertion(_)
339        | Ast::ClassUnicode(_)
340        | Ast::ClassPerl(_)
341        | Ast::ClassBracketed(_) => {}
342    }
343}
344
345fn collect_redos_risks(ast: &Ast, stats: &mut RegexComplexityStats, inside_repetition: bool) {
346    match ast {
347        Ast::Repetition(repetition) => {
348            // Flag nested quantifiers only when they can cause exponential backtracking.
349            //
350            // SAFE patterns (char class quantifier inside group quantifier):
351            //   (?:api[_\s.-]*)? — [_\s.-]* is atomic, can't overlap
352            //   (?:key|token)[=:\s"']+  — char class quantifier, deterministic
353            //
354            // DANGEROUS patterns (group/concat quantifier inside quantifier):
355            //   (a+)+       — classic ReDoS
356            //   (\w+\s*)+   — overlapping quantifiers on non-atomic elements
357            //
358            // Strategy: only flag when THIS repetition wraps a non-atomic element
359            // AND we're inside another repetition, OR when our inner AST itself
360            // contains a nested repetition wrapping a non-atomic element.
361            let this_is_simple_atom = matches!(
362                &*repetition.ast,
363                Ast::Literal(_)
364                    | Ast::Dot(_)
365                    | Ast::ClassBracketed(_)
366                    | Ast::ClassPerl(_)
367                    | Ast::ClassUnicode(_)
368            );
369            let this_is_unbounded = matches!(
370                repetition.op.kind,
371                ast::RepetitionKind::ZeroOrMore
372                    | ast::RepetitionKind::OneOrMore
373                    | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast { .. })
374            );
375            // Only flag when BOTH the outer and this repetition are unbounded
376            // and this wraps a non-atomic element. (?:group)? is safe because
377            // ? is {0,1} — it can't cause exponential backtracking.
378            if inside_repetition && !this_is_simple_atom && this_is_unbounded {
379                stats.has_nested_quantifier = true;
380            }
381            if !inside_repetition
382                && this_is_unbounded
383                && !this_is_simple_atom
384                && ast_contains_repetition(&repetition.ast)
385            {
386                stats.has_nested_quantifier = true;
387            }
388            if alternation_has_overlapping_prefixes(&repetition.ast) {
389                stats.has_quantified_overlapping_alternation = true;
390            }
391            // Only propagate inside_repetition when this is unbounded
392            collect_redos_risks(
393                &repetition.ast,
394                stats,
395                inside_repetition || this_is_unbounded,
396            );
397        }
398        Ast::Group(group) => collect_redos_risks(&group.ast, stats, inside_repetition),
399        Ast::Alternation(alternation) => {
400            for ast in &alternation.asts {
401                collect_redos_risks(ast, stats, inside_repetition);
402            }
403        }
404        Ast::Concat(concat) => {
405            for ast in &concat.asts {
406                collect_redos_risks(ast, stats, inside_repetition);
407            }
408        }
409        Ast::Empty(_)
410        | Ast::Flags(_)
411        | Ast::Literal(_)
412        | Ast::Dot(_)
413        | Ast::Assertion(_)
414        | Ast::ClassUnicode(_)
415        | Ast::ClassPerl(_)
416        | Ast::ClassBracketed(_) => {}
417    }
418}
419
420fn ast_contains_repetition(ast: &Ast) -> bool {
421    match ast {
422        Ast::Repetition(_) => true,
423        Ast::Group(group) => ast_contains_repetition(&group.ast),
424        Ast::Alternation(alternation) => alternation.asts.iter().any(ast_contains_repetition),
425        Ast::Concat(concat) => concat.asts.iter().any(ast_contains_repetition),
426        Ast::Empty(_)
427        | Ast::Flags(_)
428        | Ast::Literal(_)
429        | Ast::Dot(_)
430        | Ast::Assertion(_)
431        | Ast::ClassUnicode(_)
432        | Ast::ClassPerl(_)
433        | Ast::ClassBracketed(_) => false,
434    }
435}
436
437fn alternation_has_overlapping_prefixes(ast: &Ast) -> bool {
438    let alternatives = match ast {
439        Ast::Alternation(alternation) => &alternation.asts,
440        Ast::Group(group) => return alternation_has_overlapping_prefixes(&group.ast),
441        _ => return false,
442    };
443
444    let prefixes = alternatives
445        .iter()
446        .filter_map(literalish_prefix)
447        .collect::<Vec<_>>();
448    for (idx, prefix) in prefixes.iter().enumerate() {
449        for other in prefixes.iter().skip(idx + 1) {
450            if prefix.starts_with(other) || other.starts_with(prefix) {
451                return true;
452            }
453        }
454    }
455    false
456}
457
458fn literalish_prefix(ast: &Ast) -> Option<String> {
459    match ast {
460        Ast::Literal(literal) => Some(literal.c.to_string()),
461        Ast::Concat(concat) => {
462            let mut prefix = String::new();
463            for node in &concat.asts {
464                match node {
465                    Ast::Literal(literal) => prefix.push(literal.c),
466                    Ast::Group(group) => prefix.push_str(&literalish_prefix(&group.ast)?),
467                    _ => break,
468                }
469            }
470            (!prefix.is_empty()).then_some(prefix)
471        }
472        Ast::Group(group) => literalish_prefix(&group.ast),
473        _ => None,
474    }
475}
476
477fn update_repeat_bound(kind: &ast::RepetitionKind, stats: &mut RegexComplexityStats) {
478    let bound = match kind {
479        ast::RepetitionKind::ZeroOrOne => 1,
480        ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => MAX_REGEX_REPEAT_BOUND,
481        ast::RepetitionKind::Range(range) => match range {
482            ast::RepetitionRange::Exactly(max)
483            | ast::RepetitionRange::AtLeast(max)
484            | ast::RepetitionRange::Bounded(_, max) => *max,
485        },
486    };
487    stats.max_repeat_bound = stats.max_repeat_bound.max(bound);
488}
489
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{CompanionSpec, PatternSpec, Severity};

    /// Build a detector with a single pattern and one keyword.
    fn detector_with_pattern(regex: &str) -> DetectorSpec {
        let pattern = PatternSpec {
            regex: regex.into(),
            description: None,
            group: None,
        };
        DetectorSpec {
            id: "test-detector".into(),
            name: "Test Detector".into(),
            service: "test".into(),
            severity: Severity::High,
            keywords: vec!["token".into()],
            patterns: vec![pattern],
            verify: None,
            companions: Vec::new(),
        }
    }

    /// Build a detector with a valid pattern plus one companion using `regex`.
    fn detector_with_companion(regex: &str) -> DetectorSpec {
        let mut detector = detector_with_pattern("token_[A-Z0-9]{8}");
        detector.companions.push(CompanionSpec {
            name: "secret".into(),
            regex: regex.into(),
            within_lines: 3,
            required: false,
        });
        detector
    }

    /// True when some error-level issue mentions `needle`.
    fn has_error_containing(issues: &[QualityIssue], needle: &str) -> bool {
        issues.iter().any(|issue| {
            matches!(issue, QualityIssue::Error(message) if message.contains(needle))
        })
    }

    #[test]
    fn rejects_excessive_alternation_fanout() {
        let branches: Vec<String> = (0..65).map(|i| format!("opt{i}")).collect();
        let regex = branches.join("|");

        let issues = validate_detector(&detector_with_pattern(&regex));

        assert!(has_error_containing(&issues, "alternation branches"));
    }

    #[test]
    fn rejects_excessive_counted_repetition() {
        let issues = validate_detector(&detector_with_pattern("token[a-z]{10001}"));

        assert!(has_error_containing(&issues, "counted repetition bound"));
    }

    #[test]
    fn rejects_nested_quantifiers() {
        let issues = validate_detector(&detector_with_pattern("(a+)+b"));

        assert!(has_error_containing(&issues, "nested quantifiers"));
    }

    #[test]
    fn rejects_quantified_overlapping_alternation() {
        let issues = validate_detector(&detector_with_pattern("(ab|a)+z"));

        assert!(has_error_containing(&issues, "overlapping alternations"));
    }

    #[test]
    fn rejects_invalid_companion_regexes() {
        let issues = validate_detector(&detector_with_companion("("));

        assert!(has_error_containing(
            &issues,
            "companion 0 regex does not compile"
        ));
    }

    #[test]
    fn rejects_broad_companion_character_class() {
        let issues = validate_detector(&detector_with_companion("[A-Za-z0-9+/=]{40,}"));

        assert!(has_error_containing(&issues, "pure character class"));
    }
}
590}