Skip to main content

keyhog_core/spec/
validate.rs

1//! Detector quality gate validation rules used while loading TOML specs.
2
3use super::{DetectorSpec, VerifySpec};
4use regex_syntax::ast;
5use serde::Serialize;
6
/// Largest regex source text, in bytes, that `validate_regex_definition`
/// will hand to the parser; anything longer is reported as an error.
const MAX_REGEX_PATTERN_LEN: usize = 4 * 1024;
// NOTE: `MAX_REGEX_AST_NODES`, `MAX_REGEX_ALTERNATION_BRANCHES` and
// `MAX_REGEX_REPEAT_BOUND` used to be declared here as well. Their canonical
// definitions live in `validate_regex.rs`, which is where they are actually
// consumed. The copies in this file had no consumers and were flagged by the
// `dead_code` lint, so they were removed. This file reaches that logic
// through the `use validate_regex::validate_regex_complexity;` line at the
// bottom of the file.
14
15/// Quality issue found in a detector spec.
16///
17/// # Examples
18///
19/// ```rust
20/// use keyhog_core::QualityIssue;
21///
22/// let issue = QualityIssue::Warning("add keywords".into());
23/// assert!(matches!(issue, QualityIssue::Warning(_)));
24/// ```
25#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
26pub enum QualityIssue {
27    Error(String),
28    Warning(String),
29}
30
31/// Validate a detector spec against the quality gate.
32///
33/// # Examples
34///
35/// ```rust
36/// use keyhog_core::{DetectorSpec, PatternSpec, Severity, validate_detector};
37///
38/// let detector = DetectorSpec { tests: Vec::new(),
39///     id: "demo".into(),
40///     name: "Demo".into(),
41///     service: "demo".into(),
42///     severity: Severity::High,
43///     patterns: vec![PatternSpec {
44///         regex: "demo_[A-Z0-9]{8}".into(),
45///         ..Default::default()
46///     }],
47///     companions: Vec::new(),
48///     verify: None,
49///     keywords: vec!["demo_".into()],
50///     min_confidence: None,
51/// };
52///
53/// assert!(validate_detector(&detector).is_empty());
54/// ```
55pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
56    let mut issues = Vec::new();
57    validate_patterns_present(spec, &mut issues);
58    validate_regexes(spec, &mut issues);
59    validate_keywords(spec, &mut issues);
60    validate_pattern_specificity(spec, &mut issues);
61    validate_companions(spec, &mut issues);
62    validate_verify_spec(spec, &mut issues);
63    issues
64}
65
66fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
67    if spec.patterns.is_empty() {
68        issues.push(QualityIssue::Error("no patterns defined".into()));
69    }
70}
71
72fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
73    for (i, pat) in spec.patterns.iter().enumerate() {
74        validate_regex_definition("pattern", i, &pat.regex, issues);
75    }
76}
77
78fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
79    if spec.keywords.is_empty() {
80        issues.push(QualityIssue::Warning(
81            "no keywords defined - pattern may produce false positives".into(),
82        ));
83    }
84}
85
86fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
87    for (i, pat) in spec.patterns.iter().enumerate() {
88        let has_prefix = has_literal_prefix(&pat.regex, 3);
89        let has_group = pat.group.is_some();
90        let is_pure_charclass = is_pure_character_class(&pat.regex);
91
92        if is_pure_charclass && !has_group {
93            issues.push(QualityIssue::Error(format!(
94                "pattern {} is a pure character class ({}) - too broad without context anchoring. \
95                 Use a capture group or add a literal prefix.",
96                i, pat.regex
97            )));
98        } else if !has_prefix && !has_group && spec.keywords.is_empty() {
99            issues.push(QualityIssue::Warning(format!(
100                "pattern {} has no literal prefix and no capture group - may false-positive",
101                i
102            )));
103        }
104    }
105}
106
107fn validate_companions(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
108    for (i, companion) in spec.companions.iter().enumerate() {
109        if companion.name.trim().is_empty() {
110            issues.push(QualityIssue::Error(format!(
111                "companion {} name must not be empty",
112                i
113            )));
114        }
115        validate_regex_definition("companion", i, &companion.regex, issues);
116        // A "pure character class" companion (e.g. `[A-Z0-9]{10}` for an
117        // Algolia application_id) is acceptable when `within_lines` is small:
118        // the positional constraint is itself the contextual anchor. Reject
119        // only when the companion permits a wide search radius - at that
120        // point the lack of textual context really does over-fire.
121        if is_pure_character_class(&companion.regex) {
122            if companion.within_lines <= TIGHT_COMPANION_RADIUS {
123                issues.push(QualityIssue::Warning(format!(
124                    "companion {} regex '{}' is a pure character class; \
125                     allowed because within_lines={} ≤ {} (positional anchoring).",
126                    i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
127                )));
128            } else {
129                issues.push(QualityIssue::Error(format!(
130                    "companion {} regex '{}' is a pure character class with within_lines={} \
131                     (> {}) - the wide search radius needs a literal context anchor",
132                    i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
133                )));
134            }
135        } else if !has_substantial_literal(&companion.regex, 3) {
136            issues.push(QualityIssue::Warning(format!(
137                "companion {} regex '{}' is too broad - may produce false positives. \
138                 Add a context anchor like 'KEY_NAME='.",
139                i, companion.regex
140            )));
141        }
142    }
143}
144
145/// Companion search radius (in lines) below which a pure character-class
146/// regex is acceptable. The positional bound provides the context anchor.
147const TIGHT_COMPANION_RADIUS: usize = 5;
148
149fn validate_regex_definition(
150    kind: &str,
151    index: usize,
152    regex: &str,
153    issues: &mut Vec<QualityIssue>,
154) {
155    if regex.len() > MAX_REGEX_PATTERN_LEN {
156        issues.push(QualityIssue::Error(format!(
157            "{kind} {index} regex is too large ({} bytes > {} byte limit)",
158            regex.len(),
159            MAX_REGEX_PATTERN_LEN
160        )));
161        return;
162    }
163
164    match ast::parse::Parser::new().parse(regex) {
165        Ok(ast) => validate_regex_complexity(kind, index, &ast, issues),
166        Err(error) => issues.push(QualityIssue::Error(format!(
167            "{kind} {index} regex does not compile: {error}"
168        ))),
169    }
170}
171
/// Return `true` when `pattern` contains a run of at least `min_len`
/// consecutive literal characters.
///
/// This is a lightweight textual scan, not a regex parse. A run is made of
/// plain characters and escaped metacharacters (see `is_escaped_literal`)
/// and is ended by any regex operator. The following never contribute to a
/// run, because they are syntax rather than text the regex must match:
///
/// - everything inside a bracketed character class, including escapes;
/// - the bounds of a counted repetition (`{8}`, `{10,20}`);
/// - group headers after `(?`: flags (`(?i)`, `(?i:`), the non-capturing
///   marker (`(?:`) and capture names (`(?P<name>`, `(?<name>`).
///
/// Lengths are counted in characters, not bytes.
fn has_substantial_literal(pattern: &str, min_len: usize) -> bool {
    let mut longest = 0usize;
    let mut run = 0usize;
    let mut chars = pattern.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            '\\' => match chars.next() {
                // An escaped metacharacter matches itself: part of the run.
                Some(escaped) if is_escaped_literal(escaped) => run += 1,
                // `\d`, `\w`, `\s`, ... (or a dangling backslash) end the run.
                _ => {
                    longest = longest.max(run);
                    run = 0;
                }
            },
            '[' => {
                longest = longest.max(run);
                run = 0;
                skip_character_class(&mut chars);
            }
            '{' => {
                longest = longest.max(run);
                run = 0;
                // Skip the repetition bounds so `{100}` is not read as the
                // literal text "100".
                for bound in chars.by_ref() {
                    if bound == '}' {
                        break;
                    }
                }
            }
            '(' => {
                longest = longest.max(run);
                run = 0;
                // `(?` introduces flags or a capture name. The header ends at
                // the first `:`, `>` or `)`; none of it is matched text.
                if chars.next_if_eq(&'?').is_some() {
                    for header in chars.by_ref() {
                        if matches!(header, ':' | '>' | ')') {
                            break;
                        }
                    }
                }
            }
            ')' | '.' | '*' | '+' | '?' | '}' | '|' | '^' | '$' => {
                longest = longest.max(run);
                run = 0;
            }
            // A stray `]` outside any class neither extends nor ends the run.
            ']' => {}
            _ => run += 1,
        }
    }

    longest.max(run) >= min_len
}

/// Consume the remainder of a bracketed character class whose opening `[`
/// has already been read, leaving the iterator just past the closing `]`.
///
/// Handles a leading `^`, a `]` placed first (which is a literal member, not
/// the terminator), backslash escapes and nested classes such as
/// `[[:alnum:]_]`. An unterminated class consumes the rest of the input.
fn skip_character_class(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) {
    let _ = chars.next_if_eq(&'^');
    let _ = chars.next_if_eq(&']');

    let mut depth = 1usize;
    while let Some(ch) = chars.next() {
        match ch {
            '\\' => {
                // The escaped character can never open or close a class.
                chars.next();
            }
            '[' => depth += 1,
            ']' => {
                depth -= 1;
                if depth == 0 {
                    return;
                }
            }
            _ => {}
        }
    }
}

/// Return `true` when `ch`, written after a backslash, denotes the literal
/// metacharacter itself (`\.`, `\[`, `\\`, ...) rather than a class or
/// assertion such as `\d` or `\b`.
fn is_escaped_literal(ch: char) -> bool {
    matches!(
        ch,
        '[' | ']' | '(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '\\' | '|' | '^' | '$'
    )
}
221
222fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
223    if let Some(ref verify) = spec.verify {
224        // verify.service defaults to the detector's service - empty is fine
225        if !verify.steps.is_empty() {
226            for step in &verify.steps {
227                validate_url(&step.url, issues);
228                check_url_exfil_risk(&step.url, &verify.allowed_domains, issues);
229            }
230        } else if let Some(ref url) = verify.url {
231            validate_url(url, issues);
232            check_url_exfil_risk(url, &verify.allowed_domains, issues);
233        } else {
234            issues.push(QualityIssue::Error(
235                "verify spec has no steps and no default URL".into(),
236            ));
237        }
238        check_oob_consistency(verify, issues);
239    }
240    check_reserved_companion_names(spec, issues);
241}
242
243/// Reserved synthetic companion-map keys used by the OOB interpolator. A
244/// detector that names a companion `__keyhog_oob_*` would either be
245/// shadowed by the OOB injector or shadow it - either way, the verify
246/// templates would resolve to surprising values. Reject the names so a
247/// future detector author gets a clear error instead of a debugging
248/// nightmare.
249const RESERVED_COMPANION_NAMES: &[&str] =
250    &["__keyhog_oob_url", "__keyhog_oob_host", "__keyhog_oob_id"];
251
252fn check_reserved_companion_names(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
253    for (i, c) in spec.companions.iter().enumerate() {
254        if RESERVED_COMPANION_NAMES.contains(&c.name.as_str()) {
255            issues.push(QualityIssue::Error(format!(
256                "companion {} name '{}' is reserved for the OOB interpolator. \
257                 Pick a different name; this collision would corrupt verify templates.",
258                i, c.name,
259            )));
260        }
261    }
262}
263
264/// Check that `[detector.verify.oob]` and `{{interactsh}}` template tokens
265/// are configured consistently:
266///
267/// - `oob` set but no `{{interactsh*}}` token anywhere in the verify
268///   templates → the wait_for parks for nothing; the probe never embeds
269///   the callback URL so the service can't reach our collector.
270/// - `{{interactsh*}}` token present but `oob` unset → the token resolves
271///   to an empty string at runtime, sending malformed requests (e.g.
272///   `https:///x` or a JSON body with `"target":""`).
273///
274/// Both are misconfigurations that load successfully but produce
275/// silently-wrong verify behavior. Fail-closed at the validator instead.
276fn check_oob_consistency(verify: &VerifySpec, issues: &mut Vec<QualityIssue>) {
277    let mut interactsh_referenced = false;
278    let mut scan = |s: &str| {
279        if s.contains("{{interactsh") {
280            interactsh_referenced = true;
281        }
282    };
283    if let Some(ref url) = verify.url {
284        scan(url);
285    }
286    if let Some(ref body) = verify.body {
287        scan(body);
288    }
289    for h in &verify.headers {
290        scan(&h.value);
291    }
292    for step in &verify.steps {
293        scan(&step.url);
294        if let Some(ref body) = step.body {
295            scan(body);
296        }
297        for h in &step.headers {
298            scan(&h.value);
299        }
300    }
301    let oob_configured = verify.oob.is_some();
302    match (oob_configured, interactsh_referenced) {
303        (true, false) => issues.push(QualityIssue::Error(
304            "verify.oob is set but no `{{interactsh}}` / `{{interactsh.host}}` / \
305             `{{interactsh.url}}` / `{{interactsh.id}}` token appears in any verify \
306             template - the OOB callback URL has nowhere to land, so the wait_for \
307             would always time out. Either embed an interactsh token in the body, \
308             URL, or a header - or remove the [detector.verify.oob] block."
309                .into(),
310        )),
311        (false, true) => issues.push(QualityIssue::Error(
312            "an `{{interactsh*}}` token is referenced in a verify template but no \
313             [detector.verify.oob] block is set - the token will resolve to an empty \
314             string at runtime and ship a malformed request to the service. Either \
315             add a [detector.verify.oob] block or remove the token."
316                .into(),
317        )),
318        _ => {}
319    }
320}
321
322/// Catch detectors whose `verify.url` is built from interpolation tokens
323/// without a fixed authoritative host AND without an explicit
324/// `allowed_domains` list. The verifier's runtime domain allowlist
325/// catches these at request time, but flagging at load time gives the
326/// detector author actionable feedback before the rule ships.
327/// kimi-wave3 §1 + §1.HIGH (single-brace `{var}` and `{{shop}}` cases).
328fn check_url_exfil_risk(url: &str, allowed_domains: &[String], issues: &mut Vec<QualityIssue>) {
329    // Detect `{{match}}` or `{{companion.*}}` taking the place of the
330    // authority component of the URL. Conservative match: anything that
331    // starts with the templated host (e.g. `https://{{...}}`, plain
332    // `{{match}}`, `https://{{...}}/path`).
333    let trimmed = url.trim();
334    let after_scheme = trimmed
335        .strip_prefix("https://")
336        .or_else(|| trimmed.strip_prefix("http://"))
337        .unwrap_or(trimmed);
338    let host_starts_with_template =
339        after_scheme.starts_with("{{") || after_scheme.starts_with("{") || trimmed == "{{match}}";
340    if host_starts_with_template && allowed_domains.is_empty() {
341        issues.push(QualityIssue::Error(
342            "verify URL host is templated and no `allowed_domains` is set - \
343             attacker-controlled interpolation could exfil credentials. \
344             Either hardcode the authoritative host in the URL or set \
345             `allowed_domains` explicitly. See kimi-wave3 §1."
346                .into(),
347        ));
348    }
349    // Single-brace `{name}` is a common author error - interpolate.rs
350    // only handles `{{...}}`, so `{name}` lands in the URL literally.
351    if url.contains('{') && !url.contains("{{") {
352        issues.push(QualityIssue::Error(
353            "verify URL uses single-brace `{var}` template syntax which the \
354             interpolator does NOT honor (only `{{var}}` works); the URL will \
355             be sent to a literal-string host. Use `{{companion.var}}`."
356                .into(),
357        ));
358    }
359}
360
361fn validate_url(url: &str, issues: &mut Vec<QualityIssue>) {
362    if url.is_empty() {
363        issues.push(QualityIssue::Error("verify URL is empty".into()));
364    }
365    if url.starts_with("http://") && !url.contains("localhost") {
366        issues.push(QualityIssue::Warning(
367            "verify URL uses HTTP instead of HTTPS".into(),
368        ));
369    }
370}
371
/// Return `true` when `pattern` begins with at least `min_len` characters
/// that precede the first regex metacharacter.
///
/// The prefix ends at the first of ``[ ( . * + ? { \ | ^ $``. Length is
/// counted in characters, not bytes.
fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
    const PREFIX_TERMINATORS: &[char] =
        &['[', '(', '.', '*', '+', '?', '{', '\\', '|', '^', '$'];

    let prefix_len = pattern
        .chars()
        .take_while(|ch| !PREFIX_TERMINATORS.contains(ch))
        .count();
    prefix_len >= min_len
}
382
/// Return `true` when `pattern` is nothing but a single bracketed character
/// class, optionally followed by one quantifier.
///
/// Accepted shapes (surrounding whitespace is ignored): `[...]`, `[...]+`,
/// `[...]*`, `[...]?`, `[...]{n}`, `[...]{n,m}`, each optionally followed by
/// the lazy marker `?`.
///
/// The closing bracket is located with the class grammar in mind rather
/// than by taking the first `]`: an escaped bracket (`[a-z\]]`), a `]` in
/// first position (`[]a]`, `[^]a]`) and nested classes (`[[:alnum:]_]`) all
/// belong to the class. Without this, such patterns would not be recognised
/// as pure classes and would slip past the callers' breadth checks.
fn is_pure_character_class(pattern: &str) -> bool {
    let trimmed = pattern.trim();
    let Some(close) = character_class_end(trimmed) else {
        return false;
    };

    // `]` is ASCII, so `close + 1` is always a valid slice boundary.
    let remainder = trimmed[close + 1..].trim();
    let quantifier_len = match remainder.as_bytes().first() {
        None => return true,
        Some(b'+' | b'*' | b'?') => 1,
        Some(b'{') => match remainder.find('}') {
            Some(brace_close) => brace_close + 1,
            None => return false,
        },
        Some(_) => return false,
    };

    // Only whitespace or a single lazy marker may follow the quantifier.
    let tail = &remainder[quantifier_len..];
    tail.trim().is_empty() || tail == "?"
}

/// Byte index of the `]` that closes the character class opening at the
/// start of `pattern`, or `None` when `pattern` does not start with `[` or
/// the class is never closed.
fn character_class_end(pattern: &str) -> Option<usize> {
    let mut chars = pattern.char_indices().peekable();
    if !matches!(chars.next(), Some((_, '['))) {
        return None;
    }
    // Optional negation, then an optional literal `]` in first position.
    if matches!(chars.peek(), Some((_, '^'))) {
        chars.next();
    }
    if matches!(chars.peek(), Some((_, ']'))) {
        chars.next();
    }

    let mut depth = 1usize;
    let mut escaped = false;
    for (index, ch) in chars {
        if escaped {
            // The character after a backslash never opens or closes a class.
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '[' => depth += 1,
            ']' => {
                depth -= 1;
                if depth == 0 {
                    return Some(index);
                }
            }
            _ => {}
        }
    }
    None
}
408
409#[path = "validate_regex.rs"]
410mod validate_regex;
411use validate_regex::validate_regex_complexity;