keyhog_core/spec/
validate.rs

1//! Detector quality gate validation rules used while loading TOML specs.
2
3use super::{DetectorSpec, VerifySpec};
4use regex_syntax::ast::{self, Ast};
5use serde::Serialize;
6
/// Maximum accepted regex source length in bytes; longer patterns are rejected before parsing.
const MAX_REGEX_PATTERN_LEN: usize = 4096;
/// Maximum number of AST nodes a parsed regex may contain.
const MAX_REGEX_AST_NODES: usize = 512;
/// Maximum number of branches allowed in any single alternation.
const MAX_REGEX_ALTERNATION_BRANCHES: usize = 64;
/// Largest counted-repetition bound (the `n` in `{n}`, `{n,}` or `{m,n}`) a regex may use.
const MAX_REGEX_REPEAT_BOUND: u32 = 1_000;
11
/// Quality issue found in a detector spec.
///
/// Each variant carries a human-readable message describing the finding.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::QualityIssue;
///
/// let issue = QualityIssue::Warning("add keywords".into());
/// assert!(matches!(issue, QualityIssue::Warning(_)));
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum QualityIssue {
    /// A finding reported at error severity (e.g. a detector with no patterns
    /// or a regex that does not compile).
    Error(String),
    /// A finding reported at warning severity (e.g. a detector with no keywords).
    Warning(String),
}
27
28/// Validate a detector spec against the quality gate.
29///
30/// # Examples
31///
32/// ```rust
33/// use keyhog_core::{DetectorSpec, PatternSpec, Severity, validate_detector};
34///
35/// let detector = DetectorSpec {
36///     id: "demo".into(),
37///     name: "Demo".into(),
38///     service: "demo".into(),
39///     severity: Severity::High,
40///     patterns: vec![PatternSpec {
41///         regex: "demo_[A-Z0-9]{8}".into(),
42///         description: None,
43///         group: None,
44///     }],
45///     companions: Vec::new(),
46///     verify: None,
47///     keywords: vec!["demo_".into()],
48/// };
49///
50/// assert!(validate_detector(&detector).is_empty());
51/// ```
52pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
53    let mut issues = Vec::new();
54    validate_patterns_present(spec, &mut issues);
55    validate_regexes(spec, &mut issues);
56    validate_keywords(spec, &mut issues);
57    validate_pattern_specificity(spec, &mut issues);
58    validate_companions(spec, &mut issues);
59    validate_verify_spec(spec, &mut issues);
60    issues
61}
62
63fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
64    if spec.patterns.is_empty() {
65        issues.push(QualityIssue::Error("no patterns defined".into()));
66    }
67}
68
69fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
70    for (i, pat) in spec.patterns.iter().enumerate() {
71        validate_regex_definition("pattern", i, &pat.regex, issues);
72    }
73}
74
75fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
76    if spec.keywords.is_empty() {
77        issues.push(QualityIssue::Warning(
78            "no keywords defined — pattern may produce false positives".into(),
79        ));
80    }
81}
82
83fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
84    for (i, pat) in spec.patterns.iter().enumerate() {
85        let has_prefix = has_literal_prefix(&pat.regex, 3);
86        let has_group = pat.group.is_some();
87        let is_pure_charclass = is_pure_character_class(&pat.regex);
88
89        if is_pure_charclass && !has_group {
90            issues.push(QualityIssue::Error(format!(
91                "pattern {} is a pure character class ({}) — too broad without context anchoring. \
92                 Use a capture group or add a literal prefix.",
93                i, pat.regex
94            )));
95        } else if !has_prefix && !has_group && spec.keywords.is_empty() {
96            issues.push(QualityIssue::Warning(format!(
97                "pattern {} has no literal prefix and no capture group — may false-positive",
98                i
99            )));
100        }
101    }
102}
103
104fn validate_companions(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
105    for (i, companion) in spec.companions.iter().enumerate() {
106        if companion.name.trim().is_empty() {
107            issues.push(QualityIssue::Error(format!(
108                "companion {} name must not be empty",
109                i
110            )));
111        }
112        validate_regex_definition("companion", i, &companion.regex, issues);
113        // A "pure character class" companion (e.g. `[A-Z0-9]{10}` for an
114        // Algolia application_id) is acceptable when `within_lines` is small:
115        // the positional constraint is itself the contextual anchor. Reject
116        // only when the companion permits a wide search radius — at that
117        // point the lack of textual context really does over-fire.
118        if is_pure_character_class(&companion.regex) {
119            if companion.within_lines <= TIGHT_COMPANION_RADIUS {
120                issues.push(QualityIssue::Warning(format!(
121                    "companion {} regex '{}' is a pure character class; \
122                     allowed because within_lines={} ≤ {} (positional anchoring).",
123                    i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
124                )));
125            } else {
126                issues.push(QualityIssue::Error(format!(
127                    "companion {} regex '{}' is a pure character class with within_lines={} \
128                     (> {}) — the wide search radius needs a literal context anchor",
129                    i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
130                )));
131            }
132        } else if !has_substantial_literal(&companion.regex, 3) {
133            issues.push(QualityIssue::Warning(format!(
134                "companion {} regex '{}' is too broad — may produce false positives. \
135                 Add a context anchor like 'KEY_NAME='.",
136                i, companion.regex
137            )));
138        }
139    }
140}
141
142/// Companion search radius (in lines) below which a pure character-class
143/// regex is acceptable. The positional bound provides the context anchor.
144const TIGHT_COMPANION_RADIUS: usize = 5;
145
146fn validate_regex_definition(
147    kind: &str,
148    index: usize,
149    regex: &str,
150    issues: &mut Vec<QualityIssue>,
151) {
152    if regex.len() > MAX_REGEX_PATTERN_LEN {
153        issues.push(QualityIssue::Error(format!(
154            "{kind} {index} regex is too large ({} bytes > {} byte limit)",
155            regex.len(),
156            MAX_REGEX_PATTERN_LEN
157        )));
158        return;
159    }
160
161    match ast::parse::Parser::new().parse(regex) {
162        Ok(ast) => validate_regex_complexity(kind, index, &ast, issues),
163        Err(error) => issues.push(QualityIssue::Error(format!(
164            "{kind} {index} regex does not compile: {error}"
165        ))),
166    }
167}
168
/// Heuristically decide whether `pattern` contains a run of at least
/// `min_len` consecutive literal characters (a textual context anchor such
/// as `KEY_NAME=`).
///
/// The regex source is scanned character by character without parsing it.
/// A run is extended by ordinary characters and by escaped metacharacters
/// (`\.`, `\(`, ...). It is ended by any unescaped metacharacter, by a
/// character class, and by non-literal escapes such as `\d` or `\s`.
///
/// Text that is regex syntax rather than matched input never counts:
/// - members of a character class, including escaped ones (`[\.\+]`);
/// - the contents of `{...}`, i.e. repetition bounds like `{3,5}` and the
///   names in `\p{...}` / `\x{...}`;
/// - group headers: flags and group names in `(?i)`, `(?i:`, `(?P<name>`.
///
/// Returns `true` when the longest literal run is at least `min_len` long
/// (so `min_len == 0` is always satisfied).
fn has_substantial_literal(pattern: &str, min_len: usize) -> bool {
    let mut longest = 0usize;
    let mut run = 0usize;
    let mut in_escape = false;
    let mut in_char_class = false;
    let mut in_braces = false;
    let mut in_group_header = false;
    let mut after_open_paren = false;

    for ch in pattern.chars() {
        // Only the character immediately after an unescaped `(` can start a
        // group header, so the flag is consumed on every iteration.
        let follows_open_paren = std::mem::take(&mut after_open_paren);

        if in_escape {
            in_escape = false;
            if in_char_class {
                // An escaped class member is part of the class, not literal text.
                continue;
            }
            if is_escaped_literal(ch) {
                run += 1;
            } else {
                // `\d`, `\w`, `\s`, `\b`, ... match no fixed text.
                longest = longest.max(run);
                run = 0;
            }
            continue;
        }

        if in_char_class {
            match ch {
                '\\' => in_escape = true,
                ']' => in_char_class = false,
                _ => {}
            }
            continue;
        }

        if in_braces {
            // Repetition bounds and `\p{...}` names are syntax, not input text.
            if ch == '}' {
                in_braces = false;
            }
            continue;
        }

        if in_group_header {
            // The header ends at `:` (flags / non-capturing), `>` (named
            // group) or `)` (flag-only group).
            if matches!(ch, ':' | '>' | ')') {
                in_group_header = false;
            }
            continue;
        }

        match ch {
            '\\' => in_escape = true,
            '[' => {
                longest = longest.max(run);
                run = 0;
                in_char_class = true;
            }
            // A stray `]` outside a class neither extends nor ends a run.
            ']' => {}
            '{' => {
                longest = longest.max(run);
                run = 0;
                in_braces = true;
            }
            '(' => {
                longest = longest.max(run);
                run = 0;
                after_open_paren = true;
            }
            '?' => {
                longest = longest.max(run);
                run = 0;
                in_group_header = follows_open_paren;
            }
            ')' | '.' | '*' | '+' | '}' | '|' | '^' | '$' => {
                longest = longest.max(run);
                run = 0;
            }
            _ => run += 1,
        }
    }

    longest.max(run) >= min_len
}

/// Whether `ch`, when preceded by a backslash, stands for itself as a literal
/// character (i.e. it is one of the regex metacharacters being escaped).
fn is_escaped_literal(ch: char) -> bool {
    matches!(
        ch,
        '[' | ']' | '(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '\\' | '|' | '^' | '$'
    )
}
218
219fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
220    if let Some(ref verify) = spec.verify {
221        // verify.service defaults to the detector's service — empty is fine
222        if !verify.steps.is_empty() {
223            for step in &verify.steps {
224                validate_url(&step.url, issues);
225                check_url_exfil_risk(&step.url, &verify.allowed_domains, issues);
226            }
227        } else if let Some(ref url) = verify.url {
228            validate_url(url, issues);
229            check_url_exfil_risk(url, &verify.allowed_domains, issues);
230        } else {
231            issues.push(QualityIssue::Error(
232                "verify spec has no steps and no default URL".into(),
233            ));
234        }
235        check_oob_consistency(verify, issues);
236    }
237    check_reserved_companion_names(spec, issues);
238}
239
240/// Reserved synthetic companion-map keys used by the OOB interpolator. A
241/// detector that names a companion `__keyhog_oob_*` would either be
242/// shadowed by the OOB injector or shadow it — either way, the verify
243/// templates would resolve to surprising values. Reject the names so a
244/// future detector author gets a clear error instead of a debugging
245/// nightmare.
246const RESERVED_COMPANION_NAMES: &[&str] =
247    &["__keyhog_oob_url", "__keyhog_oob_host", "__keyhog_oob_id"];
248
249fn check_reserved_companion_names(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
250    for (i, c) in spec.companions.iter().enumerate() {
251        if RESERVED_COMPANION_NAMES.contains(&c.name.as_str()) {
252            issues.push(QualityIssue::Error(format!(
253                "companion {} name '{}' is reserved for the OOB interpolator. \
254                 Pick a different name; this collision would corrupt verify templates.",
255                i, c.name,
256            )));
257        }
258    }
259}
260
261/// Check that `[detector.verify.oob]` and `{{interactsh}}` template tokens
262/// are configured consistently:
263///
264/// - `oob` set but no `{{interactsh*}}` token anywhere in the verify
265///   templates → the wait_for parks for nothing; the probe never embeds
266///   the callback URL so the service can't reach our collector.
267/// - `{{interactsh*}}` token present but `oob` unset → the token resolves
268///   to an empty string at runtime, sending malformed requests (e.g.
269///   `https:///x` or a JSON body with `"target":""`).
270///
271/// Both are misconfigurations that load successfully but produce
272/// silently-wrong verify behavior. Fail-closed at the validator instead.
273fn check_oob_consistency(verify: &VerifySpec, issues: &mut Vec<QualityIssue>) {
274    let mut interactsh_referenced = false;
275    let mut scan = |s: &str| {
276        if s.contains("{{interactsh") {
277            interactsh_referenced = true;
278        }
279    };
280    if let Some(ref url) = verify.url {
281        scan(url);
282    }
283    if let Some(ref body) = verify.body {
284        scan(body);
285    }
286    for h in &verify.headers {
287        scan(&h.value);
288    }
289    for step in &verify.steps {
290        scan(&step.url);
291        if let Some(ref body) = step.body {
292            scan(body);
293        }
294        for h in &step.headers {
295            scan(&h.value);
296        }
297    }
298    let oob_configured = verify.oob.is_some();
299    match (oob_configured, interactsh_referenced) {
300        (true, false) => issues.push(QualityIssue::Error(
301            "verify.oob is set but no `{{interactsh}}` / `{{interactsh.host}}` / \
302             `{{interactsh.url}}` / `{{interactsh.id}}` token appears in any verify \
303             template — the OOB callback URL has nowhere to land, so the wait_for \
304             would always time out. Either embed an interactsh token in the body, \
305             URL, or a header — or remove the [detector.verify.oob] block."
306                .into(),
307        )),
308        (false, true) => issues.push(QualityIssue::Error(
309            "an `{{interactsh*}}` token is referenced in a verify template but no \
310             [detector.verify.oob] block is set — the token will resolve to an empty \
311             string at runtime and ship a malformed request to the service. Either \
312             add a [detector.verify.oob] block or remove the token."
313                .into(),
314        )),
315        _ => {}
316    }
317}
318
319/// Catch detectors whose `verify.url` is built from interpolation tokens
320/// without a fixed authoritative host AND without an explicit
321/// `allowed_domains` list. The verifier's runtime domain allowlist
322/// catches these at request time, but flagging at load time gives the
323/// detector author actionable feedback before the rule ships.
324/// kimi-wave3 §1 + §1.HIGH (single-brace `{var}` and `{{shop}}` cases).
325fn check_url_exfil_risk(url: &str, allowed_domains: &[String], issues: &mut Vec<QualityIssue>) {
326    // Detect `{{match}}` or `{{companion.*}}` taking the place of the
327    // authority component of the URL. Conservative match: anything that
328    // starts with the templated host (e.g. `https://{{...}}`, plain
329    // `{{match}}`, `https://{{...}}/path`).
330    let trimmed = url.trim();
331    let after_scheme = trimmed
332        .strip_prefix("https://")
333        .or_else(|| trimmed.strip_prefix("http://"))
334        .unwrap_or(trimmed);
335    let host_starts_with_template =
336        after_scheme.starts_with("{{") || after_scheme.starts_with("{") || trimmed == "{{match}}";
337    if host_starts_with_template && allowed_domains.is_empty() {
338        issues.push(QualityIssue::Error(
339            "verify URL host is templated and no `allowed_domains` is set — \
340             attacker-controlled interpolation could exfil credentials. \
341             Either hardcode the authoritative host in the URL or set \
342             `allowed_domains` explicitly. See kimi-wave3 §1."
343                .into(),
344        ));
345    }
346    // Single-brace `{name}` is a common author error — interpolate.rs
347    // only handles `{{...}}`, so `{name}` lands in the URL literally.
348    if url.contains('{') && !url.contains("{{") {
349        issues.push(QualityIssue::Error(
350            "verify URL uses single-brace `{var}` template syntax which the \
351             interpolator does NOT honor (only `{{var}}` works); the URL will \
352             be sent to a literal-string host. Use `{{companion.var}}`."
353                .into(),
354        ));
355    }
356}
357
358fn validate_url(url: &str, issues: &mut Vec<QualityIssue>) {
359    if url.is_empty() {
360        issues.push(QualityIssue::Error("verify URL is empty".into()));
361    }
362    if url.starts_with("http://") && !url.contains("localhost") {
363        issues.push(QualityIssue::Warning(
364            "verify URL uses HTTP instead of HTTPS".into(),
365        ));
366    }
367}
368
/// Whether `pattern` starts with at least `min_len` characters before the
/// first regex metacharacter.
///
/// Counting stops at the first of `[ ( . * + ? { \ | ^ $`; every character
/// before that point counts towards the prefix.
fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
    const STOPPERS: [char; 11] = ['[', '(', '.', '*', '+', '?', '{', '\\', '|', '^', '$'];

    let prefix_len = pattern
        .chars()
        .take_while(|ch| !STOPPERS.contains(ch))
        .count();
    prefix_len >= min_len
}
379
/// Whether `pattern` consists of nothing but a single bracketed character
/// class, optionally followed by one quantifier.
///
/// Accepted shapes (surrounding whitespace is ignored): `[...]`, `[...]+`,
/// `[...]*`, `[...]?`, `[...]{...}`, each optionally followed by a lazy
/// marker `?`. Anything else after the class means the regex has more
/// structure and is not "pure".
///
/// The end of the class is found by honoring backslash escapes and nested
/// brackets, so `[^\]]+` and `[[:alnum:]_]{32}` are recognized as pure
/// classes instead of being cut at their first `]`.
fn is_pure_character_class(pattern: &str) -> bool {
    let trimmed = pattern.trim();
    let Some(close) = character_class_end(trimmed) else {
        return false;
    };

    let remainder = trimmed[close + 1..].trim();
    let after_quantifier = match remainder.chars().next() {
        None => return true,
        Some('+') | Some('*') | Some('?') => &remainder[1..],
        Some('{') => match remainder.find('}') {
            Some(quantifier_close) => &remainder[quantifier_close + 1..],
            None => return false,
        },
        Some(_) => return false,
    };

    // A trailing `?` only makes the quantifier lazy; it adds no context.
    let after_quantifier = after_quantifier.trim();
    after_quantifier.is_empty() || after_quantifier == "?"
}

/// Byte index of the `]` that closes the character class opening at the
/// start of `pattern`, or `None` if `pattern` does not start with `[` or
/// the class is never closed.
///
/// Escaped characters are skipped and nested `[`...`]` pairs are balanced.
fn character_class_end(pattern: &str) -> Option<usize> {
    if !pattern.starts_with('[') {
        return None;
    }

    let mut depth = 0usize;
    let mut escaped = false;
    for (index, ch) in pattern.char_indices() {
        if escaped {
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '[' => depth += 1,
            ']' => {
                // `depth` is at least 1 here: the scan starts on `[` and
                // returns as soon as the outermost class closes.
                depth -= 1;
                if depth == 0 {
                    return Some(index);
                }
            }
            _ => {}
        }
    }
    None
}
405
406fn validate_regex_complexity(kind: &str, index: usize, ast: &Ast, issues: &mut Vec<QualityIssue>) {
407    let mut stats = RegexComplexityStats::default();
408    collect_regex_complexity(ast, &mut stats);
409    collect_redos_risks(ast, &mut stats, false);
410
411    if stats.nodes > MAX_REGEX_AST_NODES {
412        issues.push(QualityIssue::Error(format!(
413            "{kind} {index} regex is too complex ({} AST nodes > {} limit)",
414            stats.nodes, MAX_REGEX_AST_NODES
415        )));
416    }
417
418    if stats.max_alternation_branches > MAX_REGEX_ALTERNATION_BRANCHES {
419        issues.push(QualityIssue::Error(format!(
420            "{kind} {index} regex has too many alternation branches ({} > {} limit)",
421            stats.max_alternation_branches, MAX_REGEX_ALTERNATION_BRANCHES
422        )));
423    }
424
425    if stats.max_repeat_bound > MAX_REGEX_REPEAT_BOUND {
426        issues.push(QualityIssue::Error(format!(
427            "{kind} {index} regex has an excessive counted repetition bound ({} > {} limit)",
428            stats.max_repeat_bound, MAX_REGEX_REPEAT_BOUND
429        )));
430    }
431
432    if stats.has_nested_quantifier {
433        issues.push(QualityIssue::Error(format!(
434            "{kind} {index} regex contains nested quantifiers that can trigger pathological matching"
435        )));
436    }
437
438    if stats.has_quantified_overlapping_alternation {
439        issues.push(QualityIssue::Error(format!(
440            "{kind} {index} regex repeats overlapping alternations; use unambiguous branches instead"
441        )));
442    }
443}
444
/// Measurements gathered while walking a parsed regex AST; all fields start
/// at zero / `false`.
#[derive(Default)]
struct RegexComplexityStats {
    /// Total number of AST nodes visited.
    nodes: usize,
    /// Largest branch count seen in any single alternation.
    max_alternation_branches: usize,
    /// Largest repetition bound seen; `*` and `+` are recorded as
    /// `MAX_REGEX_REPEAT_BOUND`.
    max_repeat_bound: u32,
    /// Set when an unbounded repetition of a non-atomic element sits inside
    /// another unbounded repetition or itself contains a repetition.
    has_nested_quantifier: bool,
    /// Set when a repetition wraps an alternation in which one branch's
    /// literal prefix is a prefix of another's.
    has_quantified_overlapping_alternation: bool,
}
453
454fn collect_regex_complexity(ast: &Ast, stats: &mut RegexComplexityStats) {
455    stats.nodes += 1;
456    match ast {
457        Ast::Repetition(repetition) => {
458            update_repeat_bound(&repetition.op.kind, stats);
459            collect_regex_complexity(&repetition.ast, stats);
460        }
461        Ast::Group(group) => collect_regex_complexity(&group.ast, stats),
462        Ast::Alternation(alternation) => {
463            stats.max_alternation_branches =
464                stats.max_alternation_branches.max(alternation.asts.len());
465            for ast in &alternation.asts {
466                collect_regex_complexity(ast, stats);
467            }
468        }
469        Ast::Concat(concat) => {
470            for ast in &concat.asts {
471                collect_regex_complexity(ast, stats);
472            }
473        }
474        Ast::Empty(_)
475        | Ast::Flags(_)
476        | Ast::Literal(_)
477        | Ast::Dot(_)
478        | Ast::Assertion(_)
479        | Ast::ClassUnicode(_)
480        | Ast::ClassPerl(_)
481        | Ast::ClassBracketed(_) => {}
482    }
483}
484
485fn collect_redos_risks(ast: &Ast, stats: &mut RegexComplexityStats, inside_repetition: bool) {
486    match ast {
487        Ast::Repetition(repetition) => {
488            // Flag nested quantifiers only when they can cause exponential backtracking.
489            //
490            // SAFE patterns (char class quantifier inside group quantifier):
491            //   (?:api[_\s.-]*)? — [_\s.-]* is atomic, can't overlap
492            //   (?:key|token)[=:\s"']+  — char class quantifier, deterministic
493            //
494            // DANGEROUS patterns (group/concat quantifier inside quantifier):
495            //   (a+)+       — classic ReDoS
496            //   (\w+\s*)+   — overlapping quantifiers on non-atomic elements
497            //
498            // Strategy: only flag when THIS repetition wraps a non-atomic element
499            // AND we're inside another repetition, OR when our inner AST itself
500            // contains a nested repetition wrapping a non-atomic element.
501            let this_is_simple_atom = matches!(
502                &*repetition.ast,
503                Ast::Literal(_)
504                    | Ast::Dot(_)
505                    | Ast::ClassBracketed(_)
506                    | Ast::ClassPerl(_)
507                    | Ast::ClassUnicode(_)
508            );
509            let this_is_unbounded = matches!(
510                repetition.op.kind,
511                ast::RepetitionKind::ZeroOrMore
512                    | ast::RepetitionKind::OneOrMore
513                    | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast { .. })
514            );
515            // Only flag when BOTH the outer and this repetition are unbounded
516            // and this wraps a non-atomic element. (?:group)? is safe because
517            // ? is {0,1} — it can't cause exponential backtracking.
518            if inside_repetition && !this_is_simple_atom && this_is_unbounded {
519                stats.has_nested_quantifier = true;
520            }
521            if !inside_repetition
522                && this_is_unbounded
523                && !this_is_simple_atom
524                && ast_contains_repetition(&repetition.ast)
525            {
526                stats.has_nested_quantifier = true;
527            }
528            if alternation_has_overlapping_prefixes(&repetition.ast) {
529                stats.has_quantified_overlapping_alternation = true;
530            }
531            // Only propagate inside_repetition when this is unbounded
532            collect_redos_risks(
533                &repetition.ast,
534                stats,
535                inside_repetition || this_is_unbounded,
536            );
537        }
538        Ast::Group(group) => collect_redos_risks(&group.ast, stats, inside_repetition),
539        Ast::Alternation(alternation) => {
540            for ast in &alternation.asts {
541                collect_redos_risks(ast, stats, inside_repetition);
542            }
543        }
544        Ast::Concat(concat) => {
545            for ast in &concat.asts {
546                collect_redos_risks(ast, stats, inside_repetition);
547            }
548        }
549        Ast::Empty(_)
550        | Ast::Flags(_)
551        | Ast::Literal(_)
552        | Ast::Dot(_)
553        | Ast::Assertion(_)
554        | Ast::ClassUnicode(_)
555        | Ast::ClassPerl(_)
556        | Ast::ClassBracketed(_) => {}
557    }
558}
559
560fn ast_contains_repetition(ast: &Ast) -> bool {
561    match ast {
562        Ast::Repetition(_) => true,
563        Ast::Group(group) => ast_contains_repetition(&group.ast),
564        Ast::Alternation(alternation) => alternation.asts.iter().any(ast_contains_repetition),
565        Ast::Concat(concat) => concat.asts.iter().any(ast_contains_repetition),
566        Ast::Empty(_)
567        | Ast::Flags(_)
568        | Ast::Literal(_)
569        | Ast::Dot(_)
570        | Ast::Assertion(_)
571        | Ast::ClassUnicode(_)
572        | Ast::ClassPerl(_)
573        | Ast::ClassBracketed(_) => false,
574    }
575}
576
577fn alternation_has_overlapping_prefixes(ast: &Ast) -> bool {
578    let alternatives = match ast {
579        Ast::Alternation(alternation) => &alternation.asts,
580        Ast::Group(group) => return alternation_has_overlapping_prefixes(&group.ast),
581        _ => return false,
582    };
583
584    let prefixes = alternatives
585        .iter()
586        .filter_map(literalish_prefix)
587        .collect::<Vec<_>>();
588    for (idx, prefix) in prefixes.iter().enumerate() {
589        for other in prefixes.iter().skip(idx + 1) {
590            if prefix.starts_with(other) || other.starts_with(prefix) {
591                return true;
592            }
593        }
594    }
595    false
596}
597
598fn literalish_prefix(ast: &Ast) -> Option<String> {
599    match ast {
600        Ast::Literal(literal) => Some(literal.c.to_string()),
601        Ast::Concat(concat) => {
602            let mut prefix = String::new();
603            for node in &concat.asts {
604                match node {
605                    Ast::Literal(literal) => prefix.push(literal.c),
606                    Ast::Group(group) => prefix.push_str(&literalish_prefix(&group.ast)?),
607                    _ => break,
608                }
609            }
610            (!prefix.is_empty()).then_some(prefix)
611        }
612        Ast::Group(group) => literalish_prefix(&group.ast),
613        _ => None,
614    }
615}
616
617fn update_repeat_bound(kind: &ast::RepetitionKind, stats: &mut RegexComplexityStats) {
618    let bound = match kind {
619        ast::RepetitionKind::ZeroOrOne => 1,
620        ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => MAX_REGEX_REPEAT_BOUND,
621        ast::RepetitionKind::Range(range) => match range {
622            ast::RepetitionRange::Exactly(max)
623            | ast::RepetitionRange::AtLeast(max)
624            | ast::RepetitionRange::Bounded(_, max) => *max,
625        },
626    };
627    stats.max_repeat_bound = stats.max_repeat_bound.max(bound);
628}
629
630#[cfg(test)]
631mod oob_validation_tests {
632    use super::*;
633    use crate::spec::load_detectors_from_str;
634
635    fn errors_for(toml_src: &str) -> Vec<String> {
636        let detectors = load_detectors_from_str(toml_src).expect("toml parses");
637        let mut errs = Vec::new();
638        for d in &detectors {
639            for issue in validate_detector(d) {
640                if let QualityIssue::Error(msg) = issue {
641                    errs.push(msg);
642                }
643            }
644        }
645        errs
646    }
647
648    #[test]
649    fn oob_block_without_interactsh_token_is_error() {
650        let toml_src = r#"
651[detector]
652id = "oob-no-token"
653name = "OOB without token"
654service = "github"
655severity = "high"
656keywords = ["GHTOKEN"]
657
658[[detector.patterns]]
659regex = "GHTOKEN_[A-Z0-9]{16}"
660
661[detector.verify]
662method = "POST"
663url = "https://api.github.com/probe"
664body = '{"static":"payload"}'
665
666[detector.verify.oob]
667protocol = "http"
668"#;
669        let errs = errors_for(toml_src);
670        assert!(
671            errs.iter().any(|e| e.contains("verify.oob is set but no")),
672            "expected oob-without-token error; got {errs:?}"
673        );
674    }
675
676    #[test]
677    fn interactsh_token_without_oob_block_is_error() {
678        let toml_src = r#"
679[detector]
680id = "token-no-oob"
681name = "Token without OOB"
682service = "github"
683severity = "high"
684keywords = ["GHTOKEN"]
685
686[[detector.patterns]]
687regex = "GHTOKEN_[A-Z0-9]{16}"
688
689[detector.verify]
690method = "POST"
691url = "https://api.github.com/probe"
692body = '{"target":"https://{{interactsh}}/x"}'
693"#;
694        let errs = errors_for(toml_src);
695        assert!(
696            errs.iter().any(
697                |e| e.contains("token is referenced") && e.contains("no [detector.verify.oob]")
698            ),
699            "expected token-without-oob error; got {errs:?}"
700        );
701    }
702
703    #[test]
704    fn oob_with_interactsh_token_passes() {
705        let toml_src = r#"
706[detector]
707id = "oob-good"
708name = "OOB with token"
709service = "github"
710severity = "high"
711keywords = ["GHTOKEN"]
712
713[[detector.patterns]]
714regex = "GHTOKEN_[A-Z0-9]{16}"
715
716[detector.verify]
717method = "POST"
718url = "https://api.github.com/probe"
719body = '{"target":"https://{{interactsh}}/x"}'
720
721[detector.verify.oob]
722protocol = "http"
723"#;
724        let errs = errors_for(toml_src);
725        let oob_related: Vec<_> = errs
726            .iter()
727            .filter(|e| e.contains("oob") || e.contains("interactsh"))
728            .collect();
729        assert!(
730            oob_related.is_empty(),
731            "unexpected OOB errors: {oob_related:?}"
732        );
733    }
734
735    #[test]
736    fn reserved_companion_name_is_error() {
737        let toml_src = r#"
738[detector]
739id = "reserved-name"
740name = "Reserved name collision"
741service = "github"
742severity = "high"
743keywords = ["GHTOKEN"]
744
745[[detector.patterns]]
746regex = "GHTOKEN_[A-Z0-9]{16}"
747
748[[detector.companions]]
749name = "__keyhog_oob_url"
750regex = "(?:URL=)([a-z]{4,})"
751within_lines = 5
752"#;
753        let errs = errors_for(toml_src);
754        assert!(
755            errs.iter()
756                .any(|e| e.contains("__keyhog_oob_url") && e.contains("reserved")),
757            "expected reserved-name error; got {errs:?}"
758        );
759    }
760
761    /// Companions that are referenced via `{{companion.X}}` in a verify
762    /// template (URL / body / header / step) but whose regex contains a
763    /// context anchor (`KEY=value` style) with NO parenthesized capture
764    /// group will substitute the FULL anchor + value into the verify
765    /// template — typically corrupting the resulting request.
766    ///
767    /// `CompiledCompanion` auto-detects the first capture group when the
768    /// regex has any (`compiler.rs:369`); without a group, the whole
769    /// match is the value. So `(?:KEY=)([a-z]+)` is fine (group resolves
770    /// to `[a-z]+`), but `KEY=[a-z]+` substitutes the literal `KEY=` too.
771    ///
772    /// This audit walks the embedded corpus, identifies suspicious
773    /// companions, and asserts none exist. Any new detector whose
774    /// companion is anchored-but-not-grouped will trip this test.
775    #[test]
776    fn audit_companion_substitutions_have_capture_groups() {
777        use crate::spec::load_detectors_from_str;
778        let mut suspicious = Vec::new();
779        for (filename, toml_src) in crate::embedded_detector_tomls() {
780            let Ok(detectors) = load_detectors_from_str(toml_src) else {
781                continue;
782            };
783            for d in &detectors {
784                let Some(verify) = d.verify.as_ref() else {
785                    continue;
786                };
787                // Build the set of companion names referenced via
788                // `{{companion.X}}` in any verify template.
789                let mut substituted: std::collections::HashSet<String> =
790                    std::collections::HashSet::new();
791                let mut scan = |s: &str| {
792                    let mut rest = s;
793                    while let Some(start) = rest.find("{{companion.") {
794                        let after = &rest[start + "{{companion.".len()..];
795                        if let Some(end) = after.find("}}") {
796                            substituted.insert(after[..end].to_string());
797                            rest = &after[end + 2..];
798                        } else {
799                            break;
800                        }
801                    }
802                };
803                if let Some(ref u) = verify.url {
804                    scan(u);
805                }
806                if let Some(ref b) = verify.body {
807                    scan(b);
808                }
809                for h in &verify.headers {
810                    scan(&h.value);
811                }
812                for step in &verify.steps {
813                    scan(&step.url);
814                    if let Some(ref b) = step.body {
815                        scan(b);
816                    }
817                    for h in &step.headers {
818                        scan(&h.value);
819                    }
820                    if let crate::AuthSpec::Header { template, .. } = &step.auth {
821                        scan(template);
822                    }
823                }
824                if let Some(crate::AuthSpec::Header { template, .. }) = &verify.auth {
825                    scan(template);
826                }
827
828                for c in &d.companions {
829                    if !substituted.contains(&c.name) {
830                        continue;
831                    }
832                    // The companion's value will be substituted somewhere.
833                    // If the regex has any unescaped `(`, regex auto-detects
834                    // a capture group → fine. Otherwise check that the
835                    // regex doesn't contain a context anchor that would
836                    // bleed into the substitution.
837                    let has_group = regex_has_capture_group(&c.regex);
838                    if has_group {
839                        continue;
840                    }
                    // No group → entire match substitutes. Look for the
                    // assignment marker `=` outside character classes — it
                    // indicates the regex anchors on `KEY=value` and the
                    // substitution would include the prefix. (`:` is
                    // deliberately not checked: it appears in URL schemes.)
845                    if regex_likely_includes_anchor_prefix(&c.regex) {
846                        suspicious.push(format!(
847                            "{} (companion {} regex {:?})",
848                            filename, c.name, c.regex
849                        ));
850                    }
851                }
852            }
853        }
854        assert!(
855            suspicious.is_empty(),
856            "companions referenced in verify substitutions but lacking a capture group \
857             on a context-anchored regex (would substitute `KEY=value` instead of just \
858             `value`):\n  {}",
859            suspicious.join("\n  ")
860        );
861    }
862
863    /// Cheap heuristic: returns true if the regex has any unescaped `(`
864    /// outside a character class. Matches both capturing `(...)` and
865    /// non-capturing `(?:...)` — but the auto-detect on the scanner side
866    /// (`regex.captures_len() > 1`) only fires for capturing groups, so
867    /// we want to be more precise. This walker tracks `(?:` / `(?i:` /
868    /// `(?P<...>` etc. and only counts groups that produce a capture.
869    fn regex_has_capture_group(pattern: &str) -> bool {
870        let bytes = pattern.as_bytes();
871        let mut i = 0;
872        let mut in_class = false;
873        let mut escape = false;
874        while i < bytes.len() {
875            let b = bytes[i];
876            if escape {
877                escape = false;
878                i += 1;
879                continue;
880            }
881            match b {
882                b'\\' => {
883                    escape = true;
884                }
885                b'[' if !in_class => {
886                    in_class = true;
887                }
888                b']' if in_class => {
889                    in_class = false;
890                }
891                b'(' if !in_class => {
892                    // Distinguish (?: / (?i: / (?P<name>...) / (?<name>...) / (...)
893                    if i + 1 < bytes.len() && bytes[i + 1] == b'?' {
894                        // (?...) — non-capturing OR named group OR
895                        // look-around assertion. Distinguish them:
896                        //   (?P<name>...)  capturing (Rust + RE2 style)
897                        //   (?<name>...)   capturing (PCRE + .NET style),
898                        //                  but `(?<=` and `(?<!` are
899                        //                  zero-width look-behinds — NOT
900                        //                  capturing.
901                        //   (?:...)        non-capturing
902                        //   (?i:...) etc.  non-capturing flag groups
903                        //   (?=...) (?!...) zero-width look-around
904                        let after = &bytes[i + 2..];
905                        if after.starts_with(b"P<") {
906                            return true;
907                        }
908                        if after.starts_with(b"<") {
909                            // Disambiguate look-behind from named group.
910                            // `(?<=...)` and `(?<!...)` start with `<=`/`<!`;
911                            // anything else after `<` is a name.
912                            if after.starts_with(b"<=") || after.starts_with(b"<!") {
913                                // look-behind, non-capturing
914                            } else {
915                                return true;
916                            }
917                        }
918                        // Otherwise `(?:`, `(?i:)`, `(?=...)`, `(?!...)`,
919                        // bare flags `(?i)`, etc. — all non-capturing.
920                    } else {
921                        return true; // Plain `(` = capturing group
922                    }
923                }
924                _ => {}
925            }
926            i += 1;
927        }
928        false
929    }
930
931    /// Returns true if the regex has an assignment marker `=` outside any
932    /// character class. URL companions (the common no-capture-group case
933    /// that's actually fine) typically don't contain `=` — only their
934    /// query strings would, and matching query-string values via
935    /// companion is rare. `=` outside character classes is a strong
936    /// signal that the regex anchors on `KEY=value` and would bleed the
937    /// `KEY=` prefix into the substitution.
938    ///
939    /// `:` is intentionally NOT flagged: it appears in URL schemes
940    /// (`https://`) and would generate false positives on every URL-
941    /// shaped companion regex.
942    fn regex_likely_includes_anchor_prefix(pattern: &str) -> bool {
943        let bytes = pattern.as_bytes();
944        let mut i = 0;
945        let mut in_class = false;
946        let mut escape = false;
947        while i < bytes.len() {
948            let b = bytes[i];
949            if escape {
950                escape = false;
951                i += 1;
952                continue;
953            }
954            match b {
955                b'\\' => {
956                    escape = true;
957                }
958                b'[' if !in_class => {
959                    in_class = true;
960                }
961                b']' if in_class => {
962                    in_class = false;
963                }
964                b'=' if !in_class => return true,
965                _ => {}
966            }
967            i += 1;
968        }
969        false
970    }
971
972    /// Audit every detector's auth-field references (Bearer.field,
973    /// Basic.username, Basic.password, Query.field, AwsV4.access_key/
974    /// secret_key/session_token) and assert each one resolves to either:
975    ///   - a literal value (anything that isn't `match`, `companion`, or
976    ///     `{{...}}`),
977    ///   - the special `match` token,
978    ///   - or `companion.<name>` where `<name>` actually exists in the
979    ///     detector's companions list.
980    ///
981    /// The `resolve_field` helper falls through to "literal string" for
982    /// anything that doesn't match those exact shapes, so a typo like
983    /// `companion` (no `.name`), `companion.<typo>`, or `{{match}}`
984    /// (template syntax in a field-style slot) used to silently produce
985    /// a request that authenticated as the literal string. This audit
986    /// rejects those at validation time.
987    #[test]
988    fn audit_auth_field_references_resolve() {
989        use crate::spec::load_detectors_from_str;
990        use crate::AuthSpec;
991
992        let mut errors: Vec<String> = Vec::new();
993        for (filename, toml_src) in crate::embedded_detector_tomls() {
994            let Ok(detectors) = load_detectors_from_str(toml_src) else {
995                continue;
996            };
997            for d in &detectors {
998                let companion_names: std::collections::HashSet<&str> =
999                    d.companions.iter().map(|c| c.name.as_str()).collect();
1000
1001                let check = |label: &str, field: &str| -> Option<String> {
1002                    if field.contains("{{") {
1003                        return Some(format!(
1004                            "{filename}: {label} field {field:?} contains `{{...}}` template — \
1005                             field-style slots use `match`/`companion.<name>`/literal, NOT `{{...}}`. \
1006                             It silently resolves to the literal string."
1007                        ));
1008                    }
1009                    if field == "companion" {
1010                        return Some(format!(
1011                            "{filename}: {label} field is bare `\"companion\"` with no \
1012                             `.<name>` — silently resolves to the literal string \"companion\"."
1013                        ));
1014                    }
1015                    if let Some(name) = field.strip_prefix("companion.") {
1016                        if !companion_names.contains(name) {
1017                            return Some(format!(
1018                                "{filename}: {label} field {field:?} references companion \
1019                                 {name:?} which is not declared on this detector."
1020                            ));
1021                        }
1022                    }
1023                    None
1024                };
1025
1026                if let Some(verify) = d.verify.as_ref() {
1027                    let mut audit_auth = |auth: &AuthSpec, ctx: &str| {
1028                        match auth {
1029                            AuthSpec::Bearer { field } => {
1030                                if let Some(e) = check(&format!("{ctx} bearer.field"), field) {
1031                                    errors.push(e);
1032                                }
1033                            }
1034                            AuthSpec::Basic { username, password } => {
1035                                if let Some(e) = check(&format!("{ctx} basic.username"), username) {
1036                                    errors.push(e);
1037                                }
1038                                if let Some(e) = check(&format!("{ctx} basic.password"), password) {
1039                                    errors.push(e);
1040                                }
1041                            }
1042                            AuthSpec::Query { field, .. } => {
1043                                if let Some(e) = check(&format!("{ctx} query.field"), field) {
1044                                    errors.push(e);
1045                                }
1046                            }
1047                            AuthSpec::AwsV4 {
1048                                access_key,
1049                                secret_key,
1050                                session_token,
1051                                ..
1052                            } => {
1053                                if let Some(e) =
1054                                    check(&format!("{ctx} awsv4.access_key"), access_key)
1055                                {
1056                                    errors.push(e);
1057                                }
1058                                if let Some(e) =
1059                                    check(&format!("{ctx} awsv4.secret_key"), secret_key)
1060                                {
1061                                    errors.push(e);
1062                                }
1063                                if let Some(tok) = session_token {
1064                                    if let Some(e) =
1065                                        check(&format!("{ctx} awsv4.session_token"), tok)
1066                                    {
1067                                        errors.push(e);
1068                                    }
1069                                }
1070                            }
1071                            // Header.template is a TEMPLATE (uses interpolate
1072                            // which honors `{{match}}` / `{{companion.X}}`),
1073                            // not a field — different validation path.
1074                            AuthSpec::Header { .. } | AuthSpec::None | AuthSpec::Script { .. } => {}
1075                        }
1076                    };
1077                    if let Some(ref auth) = verify.auth {
1078                        audit_auth(auth, "verify.auth");
1079                    }
1080                    for (i, step) in verify.steps.iter().enumerate() {
1081                        audit_auth(&step.auth, &format!("verify.steps[{i}].auth"));
1082                    }
1083                }
1084            }
1085        }
1086        assert!(
1087            errors.is_empty(),
1088            "auth field reference audit found broken detectors:\n  {}",
1089            errors.join("\n  ")
1090        );
1091    }
1092
1093    #[test]
1094    fn interactsh_token_in_header_value_counts() {
1095        // The token can live in the body, URL, OR a header value — any one
1096        // satisfies the "interactsh referenced" check.
1097        let toml_src = r#"
1098[detector]
1099id = "header-oob"
1100name = "OOB via header"
1101service = "github"
1102severity = "high"
1103keywords = ["GHTOKEN"]
1104
1105[[detector.patterns]]
1106regex = "GHTOKEN_[A-Z0-9]{16}"
1107
1108[detector.verify]
1109method = "POST"
1110url = "https://api.github.com/probe"
1111
1112[[detector.verify.headers]]
1113name = "X-Callback"
1114value = "https://{{interactsh}}/x"
1115
1116[detector.verify.oob]
1117protocol = "http"
1118"#;
1119        let errs = errors_for(toml_src);
1120        let oob_related: Vec<_> = errs
1121            .iter()
1122            .filter(|e| e.contains("oob") || e.contains("interactsh"))
1123            .collect();
1124        assert!(
1125            oob_related.is_empty(),
1126            "header-token detection failed: {oob_related:?}"
1127        );
1128    }
1129}
keyhog_core/spec/validate.rs

keyhog_core/spec/
validate.rs