Skip to main content

keyhog_core/spec/
validate.rs

1//! Detector quality gate validation rules used while loading TOML specs.
2
3use super::DetectorSpec;
4use regex_syntax::ast::{self, Ast};
5use serde::Serialize;
6
/// Largest accepted regex source string, measured in bytes (`str::len`).
const MAX_REGEX_PATTERN_LEN: usize = 4096;
/// Largest accepted number of AST nodes in a single parsed regex.
const MAX_REGEX_AST_NODES: usize = 512;
/// Largest accepted number of branches in any one alternation.
const MAX_REGEX_ALTERNATION_BRANCHES: usize = 64;
/// Largest accepted counted-repetition bound; `*` and `+` are recorded as
/// exactly this value, so they never exceed the limit on their own.
const MAX_REGEX_REPEAT_BOUND: u32 = 1_000;
11
/// Quality issue found in a detector spec.
///
/// Each variant carries a human-readable message describing the finding.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::QualityIssue;
///
/// let issue = QualityIssue::Warning("add keywords".into());
/// assert!(matches!(issue, QualityIssue::Warning(_)));
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum QualityIssue {
    /// Finding reported at error level (e.g. no patterns, a regex that does
    /// not parse, or a regex exceeding a complexity limit).
    Error(String),
    /// Finding reported at warning level (e.g. missing keywords or a broad
    /// pattern that may false-positive).
    Warning(String),
}
27
28/// Validate a detector spec against the quality gate.
29///
30/// # Examples
31///
32/// ```rust
33/// use keyhog_core::{DetectorSpec, PatternSpec, Severity, validate_detector};
34///
35/// let detector = DetectorSpec {
36///     id: "demo".into(),
37///     name: "Demo".into(),
38///     service: "demo".into(),
39///     severity: Severity::High,
40///     patterns: vec![PatternSpec {
41///         regex: "demo_[A-Z0-9]{8}".into(),
42///         description: None,
43///         group: None,
44///     }],
45///     companions: Vec::new(),
46///     verify: None,
47///     keywords: vec!["demo_".into()],
48/// };
49///
50/// assert!(validate_detector(&detector).is_empty());
51/// ```
52pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
53    let mut issues = Vec::new();
54    validate_patterns_present(spec, &mut issues);
55    validate_regexes(spec, &mut issues);
56    validate_keywords(spec, &mut issues);
57    validate_pattern_specificity(spec, &mut issues);
58    validate_companions(spec, &mut issues);
59    validate_verify_spec(spec, &mut issues);
60    issues
61}
62
63fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
64    if spec.patterns.is_empty() {
65        issues.push(QualityIssue::Error("no patterns defined".into()));
66    }
67}
68
69fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
70    for (i, pat) in spec.patterns.iter().enumerate() {
71        validate_regex_definition("pattern", i, &pat.regex, issues);
72    }
73}
74
75fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
76    if spec.keywords.is_empty() {
77        issues.push(QualityIssue::Warning(
78            "no keywords defined — pattern may produce false positives".into(),
79        ));
80    }
81}
82
83fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
84    for (i, pat) in spec.patterns.iter().enumerate() {
85        let has_prefix = has_literal_prefix(&pat.regex, 3);
86        let has_group = pat.group.is_some();
87        let is_pure_charclass = is_pure_character_class(&pat.regex);
88
89        if is_pure_charclass && !has_group {
90            issues.push(QualityIssue::Error(format!(
91                "pattern {} is a pure character class ({}) — too broad without context anchoring. \
92                 Use a capture group or add a literal prefix.",
93                i, pat.regex
94            )));
95        } else if !has_prefix && !has_group && spec.keywords.is_empty() {
96            issues.push(QualityIssue::Warning(format!(
97                "pattern {} has no literal prefix and no capture group — may false-positive",
98                i
99            )));
100        }
101    }
102}
103
104fn validate_companions(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
105    for (i, companion) in spec.companions.iter().enumerate() {
106        if companion.name.trim().is_empty() {
107            issues.push(QualityIssue::Error(format!(
108                "companion {} name must not be empty",
109                i
110            )));
111        }
112        validate_regex_definition("companion", i, &companion.regex, issues);
113        // A "pure character class" companion (e.g. `[A-Z0-9]{10}` for an
114        // Algolia application_id) is acceptable when `within_lines` is small:
115        // the positional constraint is itself the contextual anchor. Reject
116        // only when the companion permits a wide search radius — at that
117        // point the lack of textual context really does over-fire.
118        if is_pure_character_class(&companion.regex) {
119            if companion.within_lines <= TIGHT_COMPANION_RADIUS {
120                issues.push(QualityIssue::Warning(format!(
121                    "companion {} regex '{}' is a pure character class; \
122                     allowed because within_lines={} ≤ {} (positional anchoring).",
123                    i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
124                )));
125            } else {
126                issues.push(QualityIssue::Error(format!(
127                    "companion {} regex '{}' is a pure character class with within_lines={} \
128                     (> {}) — the wide search radius needs a literal context anchor",
129                    i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
130                )));
131            }
132        } else if !has_substantial_literal(&companion.regex, 3) {
133            issues.push(QualityIssue::Warning(format!(
134                "companion {} regex '{}' is too broad — may produce false positives. \
135                 Add a context anchor like 'KEY_NAME='.",
136                i, companion.regex
137            )));
138        }
139    }
140}
141
142/// Companion search radius (in lines) below which a pure character-class
143/// regex is acceptable. The positional bound provides the context anchor.
144const TIGHT_COMPANION_RADIUS: usize = 5;
145
146fn validate_regex_definition(
147    kind: &str,
148    index: usize,
149    regex: &str,
150    issues: &mut Vec<QualityIssue>,
151) {
152    if regex.len() > MAX_REGEX_PATTERN_LEN {
153        issues.push(QualityIssue::Error(format!(
154            "{kind} {index} regex is too large ({} bytes > {} byte limit)",
155            regex.len(),
156            MAX_REGEX_PATTERN_LEN
157        )));
158        return;
159    }
160
161    match ast::parse::Parser::new().parse(regex) {
162        Ok(ast) => validate_regex_complexity(kind, index, &ast, issues),
163        Err(error) => issues.push(QualityIssue::Error(format!(
164            "{kind} {index} regex does not compile: {error}"
165        ))),
166    }
167}
168
/// Report whether `pattern` contains a run of at least `min_len` consecutive
/// literal characters.
///
/// This is a lightweight textual scan, not a regex parse. A run is made of
/// plain characters and escaped metacharacters (`\.`, `\[`, ...). It is ended
/// by any unescaped metacharacter, by a non-literal escape such as `\d`, by a
/// bracketed class, or by a counted repetition.
///
/// The contents of `[...]` and `{...}` never count towards a run: a class
/// matches a single character, and `{10,20}` is a quantifier, not text.
///
/// NOTE(review): group headers are not recognised, so the name in
/// `(?P<name>...)` is still counted as literal text.
fn has_substantial_literal(pattern: &str, min_len: usize) -> bool {
    let mut longest = 0usize;
    let mut run = 0usize;
    let mut in_escape = false;
    let mut in_char_class = false;
    let mut in_counted_repeat = false;

    for ch in pattern.chars() {
        if in_escape {
            in_escape = false;
            if in_char_class {
                // An escaped character inside a class (including `\]`) is
                // still just a class member, not literal text.
                continue;
            }
            if is_escaped_literal(ch) {
                run += 1;
            } else {
                longest = longest.max(run);
                run = 0;
            }
            continue;
        }

        if in_char_class {
            match ch {
                '\\' => in_escape = true,
                ']' => in_char_class = false,
                _ => {}
            }
            continue;
        }

        if in_counted_repeat {
            // Skip the bounds of `{n}`, `{n,}` and `{n,m}`.
            if ch == '}' {
                in_counted_repeat = false;
            }
            continue;
        }

        match ch {
            '\\' => in_escape = true,
            '[' => {
                longest = longest.max(run);
                run = 0;
                in_char_class = true;
            }
            '{' => {
                longest = longest.max(run);
                run = 0;
                in_counted_repeat = true;
            }
            '(' | ')' | '.' | '*' | '+' | '?' | '}' | '|' | '^' | '$' => {
                longest = longest.max(run);
                run = 0;
            }
            _ => run += 1,
        }
    }

    longest.max(run) >= min_len
}

/// Report whether `ch`, when preceded by a backslash, denotes that literal
/// character (an escaped regex metacharacter) rather than a class shorthand
/// or other escape sequence.
fn is_escaped_literal(ch: char) -> bool {
    matches!(
        ch,
        '[' | ']' | '(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '\\' | '|' | '^' | '$'
    )
}
218
219fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
220    if let Some(ref verify) = spec.verify {
221        // verify.service defaults to the detector's service — empty is fine
222        if !verify.steps.is_empty() {
223            for step in &verify.steps {
224                validate_url(&step.url, issues);
225                check_url_exfil_risk(&step.url, &verify.allowed_domains, issues);
226            }
227        } else if let Some(ref url) = verify.url {
228            validate_url(url, issues);
229            check_url_exfil_risk(url, &verify.allowed_domains, issues);
230        } else {
231            issues.push(QualityIssue::Error(
232                "verify spec has no steps and no default URL".into(),
233            ));
234        }
235    }
236}
237
238/// Catch detectors whose `verify.url` is built from interpolation tokens
239/// without a fixed authoritative host AND without an explicit
240/// `allowed_domains` list. The verifier's runtime domain allowlist
241/// catches these at request time, but flagging at load time gives the
242/// detector author actionable feedback before the rule ships.
243/// kimi-wave3 §1 + §1.HIGH (single-brace `{var}` and `{{shop}}` cases).
244fn check_url_exfil_risk(url: &str, allowed_domains: &[String], issues: &mut Vec<QualityIssue>) {
245    // Detect `{{match}}` or `{{companion.*}}` taking the place of the
246    // authority component of the URL. Conservative match: anything that
247    // starts with the templated host (e.g. `https://{{...}}`, plain
248    // `{{match}}`, `https://{{...}}/path`).
249    let trimmed = url.trim();
250    let after_scheme = trimmed
251        .strip_prefix("https://")
252        .or_else(|| trimmed.strip_prefix("http://"))
253        .unwrap_or(trimmed);
254    let host_starts_with_template =
255        after_scheme.starts_with("{{") || after_scheme.starts_with("{") || trimmed == "{{match}}";
256    if host_starts_with_template && allowed_domains.is_empty() {
257        issues.push(QualityIssue::Error(
258            "verify URL host is templated and no `allowed_domains` is set — \
259             attacker-controlled interpolation could exfil credentials. \
260             Either hardcode the authoritative host in the URL or set \
261             `allowed_domains` explicitly. See kimi-wave3 §1."
262                .into(),
263        ));
264    }
265    // Single-brace `{name}` is a common author error — interpolate.rs
266    // only handles `{{...}}`, so `{name}` lands in the URL literally.
267    if url.contains('{') && !url.contains("{{") {
268        issues.push(QualityIssue::Error(
269            "verify URL uses single-brace `{var}` template syntax which the \
270             interpolator does NOT honor (only `{{var}}` works); the URL will \
271             be sent to a literal-string host. Use `{{companion.var}}`."
272                .into(),
273        ));
274    }
275}
276
277fn validate_url(url: &str, issues: &mut Vec<QualityIssue>) {
278    if url.is_empty() {
279        issues.push(QualityIssue::Error("verify URL is empty".into()));
280    }
281    if url.starts_with("http://") && !url.contains("localhost") {
282        issues.push(QualityIssue::Warning(
283            "verify URL uses HTTP instead of HTTPS".into(),
284        ));
285    }
286}
287
/// Report whether `pattern` begins with at least `min_len` characters that
/// precede the first regex metacharacter (`[ ( . * + ? { \ | ^ $`).
///
/// This is a textual check on the pattern source, counted in `char`s.
fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
    const STOP_CHARS: &[char] = &['[', '(', '.', '*', '+', '?', '{', '\\', '|', '^', '$'];

    let prefix_len = pattern
        .chars()
        .take_while(|ch| !STOP_CHARS.contains(ch))
        .count();
    prefix_len >= min_len
}
298
/// Report whether `pattern` is nothing but a single bracketed character
/// class with an optional quantifier, e.g. `[A-Z0-9]`, `[a-f0-9]+` or
/// `[A-Za-z0-9]{32}`.
///
/// The closing bracket is located with awareness of escapes (`[\]a-z]`),
/// nested classes such as `[[:alnum:]]`, and a literal `]` placed first in
/// the class. A trailing lazy marker (`+?`, `{8}?`) is accepted because it
/// does not add any context. Surrounding whitespace is ignored. This is a
/// textual heuristic and does not check that the pattern is a valid regex.
fn is_pure_character_class(pattern: &str) -> bool {
    let trimmed = pattern.trim();
    if !trimmed.starts_with('[') {
        return false;
    }

    let Some(close) = bracketed_class_end(trimmed) else {
        return false;
    };

    let remainder = trimmed[close + 1..].trim();
    let after_quantifier = match remainder.chars().next() {
        None => return true,
        // `+`, `*` and `?` are one ASCII byte, so slicing at 1 is safe.
        Some('+' | '*' | '?') => &remainder[1..],
        Some('{') => match remainder.find('}') {
            Some(qclose) => &remainder[qclose + 1..],
            None => return false,
        },
        Some(_) => return false,
    };

    let after_quantifier = after_quantifier.trim();
    after_quantifier.is_empty() || after_quantifier == "?"
}

/// Return the byte index of the `]` that closes the bracketed class opening
/// at the start of `pattern`, or `None` when the class is never closed.
fn bracketed_class_end(pattern: &str) -> Option<usize> {
    let mut depth = 0usize;
    let mut escaped = false;
    // True right after `[` or `[^`, where a `]` is a literal member.
    let mut at_class_start = false;

    for (idx, ch) in pattern.char_indices() {
        if escaped {
            escaped = false;
            at_class_start = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '[' => {
                depth += 1;
                at_class_start = true;
                continue;
            }
            '^' if at_class_start => continue,
            ']' if !at_class_start => {
                depth = depth.saturating_sub(1);
                if depth == 0 {
                    return Some(idx);
                }
            }
            _ => {}
        }
        at_class_start = false;
    }
    None
}
324
325fn validate_regex_complexity(kind: &str, index: usize, ast: &Ast, issues: &mut Vec<QualityIssue>) {
326    let mut stats = RegexComplexityStats::default();
327    collect_regex_complexity(ast, &mut stats);
328    collect_redos_risks(ast, &mut stats, false);
329
330    if stats.nodes > MAX_REGEX_AST_NODES {
331        issues.push(QualityIssue::Error(format!(
332            "{kind} {index} regex is too complex ({} AST nodes > {} limit)",
333            stats.nodes, MAX_REGEX_AST_NODES
334        )));
335    }
336
337    if stats.max_alternation_branches > MAX_REGEX_ALTERNATION_BRANCHES {
338        issues.push(QualityIssue::Error(format!(
339            "{kind} {index} regex has too many alternation branches ({} > {} limit)",
340            stats.max_alternation_branches, MAX_REGEX_ALTERNATION_BRANCHES
341        )));
342    }
343
344    if stats.max_repeat_bound > MAX_REGEX_REPEAT_BOUND {
345        issues.push(QualityIssue::Error(format!(
346            "{kind} {index} regex has an excessive counted repetition bound ({} > {} limit)",
347            stats.max_repeat_bound, MAX_REGEX_REPEAT_BOUND
348        )));
349    }
350
351    if stats.has_nested_quantifier {
352        issues.push(QualityIssue::Error(format!(
353            "{kind} {index} regex contains nested quantifiers that can trigger pathological matching"
354        )));
355    }
356
357    if stats.has_quantified_overlapping_alternation {
358        issues.push(QualityIssue::Error(format!(
359            "{kind} {index} regex repeats overlapping alternations; use unambiguous branches instead"
360        )));
361    }
362}
363
/// Accumulated measurements for one parsed regex, filled in by
/// `collect_regex_complexity` and `collect_redos_risks`.
#[derive(Default)]
struct RegexComplexityStats {
    /// Number of AST nodes visited.
    nodes: usize,
    /// Largest branch count seen in any single alternation.
    max_alternation_branches: usize,
    /// Largest repetition bound seen; `*` and `+` are recorded as
    /// `MAX_REGEX_REPEAT_BOUND`.
    max_repeat_bound: u32,
    /// Set when an unbounded quantifier over a non-atomic element is nested
    /// with another quantifier.
    has_nested_quantifier: bool,
    /// Set when a repeated alternation has branches whose literal prefixes
    /// overlap.
    has_quantified_overlapping_alternation: bool,
}
372
373fn collect_regex_complexity(ast: &Ast, stats: &mut RegexComplexityStats) {
374    stats.nodes += 1;
375    match ast {
376        Ast::Repetition(repetition) => {
377            update_repeat_bound(&repetition.op.kind, stats);
378            collect_regex_complexity(&repetition.ast, stats);
379        }
380        Ast::Group(group) => collect_regex_complexity(&group.ast, stats),
381        Ast::Alternation(alternation) => {
382            stats.max_alternation_branches =
383                stats.max_alternation_branches.max(alternation.asts.len());
384            for ast in &alternation.asts {
385                collect_regex_complexity(ast, stats);
386            }
387        }
388        Ast::Concat(concat) => {
389            for ast in &concat.asts {
390                collect_regex_complexity(ast, stats);
391            }
392        }
393        Ast::Empty(_)
394        | Ast::Flags(_)
395        | Ast::Literal(_)
396        | Ast::Dot(_)
397        | Ast::Assertion(_)
398        | Ast::ClassUnicode(_)
399        | Ast::ClassPerl(_)
400        | Ast::ClassBracketed(_) => {}
401    }
402}
403
404fn collect_redos_risks(ast: &Ast, stats: &mut RegexComplexityStats, inside_repetition: bool) {
405    match ast {
406        Ast::Repetition(repetition) => {
407            // Flag nested quantifiers only when they can cause exponential backtracking.
408            //
409            // SAFE patterns (char class quantifier inside group quantifier):
410            //   (?:api[_\s.-]*)? — [_\s.-]* is atomic, can't overlap
411            //   (?:key|token)[=:\s"']+  — char class quantifier, deterministic
412            //
413            // DANGEROUS patterns (group/concat quantifier inside quantifier):
414            //   (a+)+       — classic ReDoS
415            //   (\w+\s*)+   — overlapping quantifiers on non-atomic elements
416            //
417            // Strategy: only flag when THIS repetition wraps a non-atomic element
418            // AND we're inside another repetition, OR when our inner AST itself
419            // contains a nested repetition wrapping a non-atomic element.
420            let this_is_simple_atom = matches!(
421                &*repetition.ast,
422                Ast::Literal(_)
423                    | Ast::Dot(_)
424                    | Ast::ClassBracketed(_)
425                    | Ast::ClassPerl(_)
426                    | Ast::ClassUnicode(_)
427            );
428            let this_is_unbounded = matches!(
429                repetition.op.kind,
430                ast::RepetitionKind::ZeroOrMore
431                    | ast::RepetitionKind::OneOrMore
432                    | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast { .. })
433            );
434            // Only flag when BOTH the outer and this repetition are unbounded
435            // and this wraps a non-atomic element. (?:group)? is safe because
436            // ? is {0,1} — it can't cause exponential backtracking.
437            if inside_repetition && !this_is_simple_atom && this_is_unbounded {
438                stats.has_nested_quantifier = true;
439            }
440            if !inside_repetition
441                && this_is_unbounded
442                && !this_is_simple_atom
443                && ast_contains_repetition(&repetition.ast)
444            {
445                stats.has_nested_quantifier = true;
446            }
447            if alternation_has_overlapping_prefixes(&repetition.ast) {
448                stats.has_quantified_overlapping_alternation = true;
449            }
450            // Only propagate inside_repetition when this is unbounded
451            collect_redos_risks(
452                &repetition.ast,
453                stats,
454                inside_repetition || this_is_unbounded,
455            );
456        }
457        Ast::Group(group) => collect_redos_risks(&group.ast, stats, inside_repetition),
458        Ast::Alternation(alternation) => {
459            for ast in &alternation.asts {
460                collect_redos_risks(ast, stats, inside_repetition);
461            }
462        }
463        Ast::Concat(concat) => {
464            for ast in &concat.asts {
465                collect_redos_risks(ast, stats, inside_repetition);
466            }
467        }
468        Ast::Empty(_)
469        | Ast::Flags(_)
470        | Ast::Literal(_)
471        | Ast::Dot(_)
472        | Ast::Assertion(_)
473        | Ast::ClassUnicode(_)
474        | Ast::ClassPerl(_)
475        | Ast::ClassBracketed(_) => {}
476    }
477}
478
479fn ast_contains_repetition(ast: &Ast) -> bool {
480    match ast {
481        Ast::Repetition(_) => true,
482        Ast::Group(group) => ast_contains_repetition(&group.ast),
483        Ast::Alternation(alternation) => alternation.asts.iter().any(ast_contains_repetition),
484        Ast::Concat(concat) => concat.asts.iter().any(ast_contains_repetition),
485        Ast::Empty(_)
486        | Ast::Flags(_)
487        | Ast::Literal(_)
488        | Ast::Dot(_)
489        | Ast::Assertion(_)
490        | Ast::ClassUnicode(_)
491        | Ast::ClassPerl(_)
492        | Ast::ClassBracketed(_) => false,
493    }
494}
495
496fn alternation_has_overlapping_prefixes(ast: &Ast) -> bool {
497    let alternatives = match ast {
498        Ast::Alternation(alternation) => &alternation.asts,
499        Ast::Group(group) => return alternation_has_overlapping_prefixes(&group.ast),
500        _ => return false,
501    };
502
503    let prefixes = alternatives
504        .iter()
505        .filter_map(literalish_prefix)
506        .collect::<Vec<_>>();
507    for (idx, prefix) in prefixes.iter().enumerate() {
508        for other in prefixes.iter().skip(idx + 1) {
509            if prefix.starts_with(other) || other.starts_with(prefix) {
510                return true;
511            }
512        }
513    }
514    false
515}
516
517fn literalish_prefix(ast: &Ast) -> Option<String> {
518    match ast {
519        Ast::Literal(literal) => Some(literal.c.to_string()),
520        Ast::Concat(concat) => {
521            let mut prefix = String::new();
522            for node in &concat.asts {
523                match node {
524                    Ast::Literal(literal) => prefix.push(literal.c),
525                    Ast::Group(group) => prefix.push_str(&literalish_prefix(&group.ast)?),
526                    _ => break,
527                }
528            }
529            (!prefix.is_empty()).then_some(prefix)
530        }
531        Ast::Group(group) => literalish_prefix(&group.ast),
532        _ => None,
533    }
534}
535
536fn update_repeat_bound(kind: &ast::RepetitionKind, stats: &mut RegexComplexityStats) {
537    let bound = match kind {
538        ast::RepetitionKind::ZeroOrOne => 1,
539        ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => MAX_REGEX_REPEAT_BOUND,
540        ast::RepetitionKind::Range(range) => match range {
541            ast::RepetitionRange::Exactly(max)
542            | ast::RepetitionRange::AtLeast(max)
543            | ast::RepetitionRange::Bounded(_, max) => *max,
544        },
545    };
546    stats.max_repeat_bound = stats.max_repeat_bound.max(bound);
547}