Skip to main content

keyhog_core/spec/
validate.rs

1//! Detector quality gate validation rules used while loading TOML specs.
2
3use super::{DetectorSpec, VerifySpec};
4use regex_syntax::ast;
5use serde::Serialize;
6
/// Upper bound on a regex's source length, measured in bytes (`str::len`).
/// `validate_regex_definition` rejects longer patterns with an error before
/// attempting to parse them.
const MAX_REGEX_PATTERN_LEN: usize = 4096;
8// MAX_REGEX_AST_NODES / MAX_REGEX_ALTERNATION_BRANCHES /
9// MAX_REGEX_REPEAT_BOUND were originally defined here too but are the
10// canonical constants in `validate_regex.rs` (which is where they're
11// actually consumed). Duplicates here had no consumers - clippy
12// `dead_code` flagged them. Re-imports happen via the `use
13// validate_regex::validate_regex_complexity;` below.
14
/// Quality issue found in a detector spec.
///
/// Each variant carries a human-readable message describing the finding.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::QualityIssue;
///
/// let issue = QualityIssue::Warning("add keywords".into());
/// assert!(matches!(issue, QualityIssue::Warning(_)));
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum QualityIssue {
    /// A finding the validators report as an error, e.g. a spec with no
    /// patterns or a regex that fails to parse.
    /// NOTE(review): presumably errors block the spec from loading - confirm
    /// against the loader.
    Error(String),
    /// An advisory finding, e.g. a spec without keywords that may produce
    /// false positives.
    Warning(String),
}
30
31/// Validate a detector spec against the quality gate.
32///
33/// # Examples
34///
35/// ```rust
36/// use keyhog_core::{DetectorSpec, PatternSpec, Severity, validate_detector};
37///
38/// let detector = DetectorSpec {
39///     id: "demo".into(),
40///     name: "Demo".into(),
41///     service: "demo".into(),
42///     severity: Severity::High,
43///     patterns: vec![PatternSpec {
44///         regex: "demo_[A-Z0-9]{8}".into(),
45///         ..Default::default()
46///     }],
47///     companions: Vec::new(),
48///     verify: None,
49///     keywords: vec!["demo_".into()],
50/// };
51///
52/// assert!(validate_detector(&detector).is_empty());
53/// ```
54pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
55    let mut issues = Vec::new();
56    validate_patterns_present(spec, &mut issues);
57    validate_regexes(spec, &mut issues);
58    validate_keywords(spec, &mut issues);
59    validate_pattern_specificity(spec, &mut issues);
60    validate_companions(spec, &mut issues);
61    validate_verify_spec(spec, &mut issues);
62    issues
63}
64
65fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
66    if spec.patterns.is_empty() {
67        issues.push(QualityIssue::Error("no patterns defined".into()));
68    }
69}
70
71fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
72    for (i, pat) in spec.patterns.iter().enumerate() {
73        validate_regex_definition("pattern", i, &pat.regex, issues);
74    }
75}
76
77fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
78    if spec.keywords.is_empty() {
79        issues.push(QualityIssue::Warning(
80            "no keywords defined - pattern may produce false positives".into(),
81        ));
82    }
83}
84
85fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
86    for (i, pat) in spec.patterns.iter().enumerate() {
87        let has_prefix = has_literal_prefix(&pat.regex, 3);
88        let has_group = pat.group.is_some();
89        let is_pure_charclass = is_pure_character_class(&pat.regex);
90
91        if is_pure_charclass && !has_group {
92            issues.push(QualityIssue::Error(format!(
93                "pattern {} is a pure character class ({}) - too broad without context anchoring. \
94                 Use a capture group or add a literal prefix.",
95                i, pat.regex
96            )));
97        } else if !has_prefix && !has_group && spec.keywords.is_empty() {
98            issues.push(QualityIssue::Warning(format!(
99                "pattern {} has no literal prefix and no capture group - may false-positive",
100                i
101            )));
102        }
103    }
104}
105
106fn validate_companions(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
107    for (i, companion) in spec.companions.iter().enumerate() {
108        if companion.name.trim().is_empty() {
109            issues.push(QualityIssue::Error(format!(
110                "companion {} name must not be empty",
111                i
112            )));
113        }
114        validate_regex_definition("companion", i, &companion.regex, issues);
115        // A "pure character class" companion (e.g. `[A-Z0-9]{10}` for an
116        // Algolia application_id) is acceptable when `within_lines` is small:
117        // the positional constraint is itself the contextual anchor. Reject
118        // only when the companion permits a wide search radius - at that
119        // point the lack of textual context really does over-fire.
120        if is_pure_character_class(&companion.regex) {
121            if companion.within_lines <= TIGHT_COMPANION_RADIUS {
122                issues.push(QualityIssue::Warning(format!(
123                    "companion {} regex '{}' is a pure character class; \
124                     allowed because within_lines={} ≤ {} (positional anchoring).",
125                    i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
126                )));
127            } else {
128                issues.push(QualityIssue::Error(format!(
129                    "companion {} regex '{}' is a pure character class with within_lines={} \
130                     (> {}) - the wide search radius needs a literal context anchor",
131                    i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
132                )));
133            }
134        } else if !has_substantial_literal(&companion.regex, 3) {
135            issues.push(QualityIssue::Warning(format!(
136                "companion {} regex '{}' is too broad - may produce false positives. \
137                 Add a context anchor like 'KEY_NAME='.",
138                i, companion.regex
139            )));
140        }
141    }
142}
143
/// Largest companion search radius (in lines) at which a pure
/// character-class regex is still acceptable. The bound is inclusive:
/// `validate_companions` compares `within_lines <= TIGHT_COMPANION_RADIUS`.
/// The positional bound provides the context anchor.
const TIGHT_COMPANION_RADIUS: usize = 5;
147
148fn validate_regex_definition(
149    kind: &str,
150    index: usize,
151    regex: &str,
152    issues: &mut Vec<QualityIssue>,
153) {
154    if regex.len() > MAX_REGEX_PATTERN_LEN {
155        issues.push(QualityIssue::Error(format!(
156            "{kind} {index} regex is too large ({} bytes > {} byte limit)",
157            regex.len(),
158            MAX_REGEX_PATTERN_LEN
159        )));
160        return;
161    }
162
163    match ast::parse::Parser::new().parse(regex) {
164        Ok(ast) => validate_regex_complexity(kind, index, &ast, issues),
165        Err(error) => issues.push(QualityIssue::Error(format!(
166            "{kind} {index} regex does not compile: {error}"
167        ))),
168    }
169}
170
/// Heuristically decide whether `pattern` contains a run of at least
/// `min_len` consecutive literal characters.
///
/// This is a lightweight scan over the regex source, not a real parse. A
/// "literal run" is a sequence of plain characters and escaped metacharacters
/// (see [`is_escaped_literal`]) that is not interrupted by regex syntax.
/// The following never contribute to a run:
///
/// - anything inside a character class `[...]`, including escaped
///   metacharacters such as `[\.\*]`;
/// - the contents of a brace group `{...}` - counted-repetition bounds such
///   as `{3,10}` and the name in `\p{Greek}` / `\x{41}`;
/// - a group header following `(?` - flags (`(?i)`, `(?i:`), the
///   non-capturing marker (`(?:`) and capture names (`(?P<name>`, `(?<name>`);
/// - non-literal escapes such as `\d` or `\w`, which also end the current run.
///
/// Returns `true` when the longest run is at least `min_len` characters, so
/// `min_len == 0` is always satisfied.
fn has_substantial_literal(pattern: &str, min_len: usize) -> bool {
    let mut longest_run = 0usize;
    let mut current_run = 0usize;
    let mut in_char_class = false;
    let mut chars = pattern.chars().peekable();

    while let Some(ch) = chars.next() {
        if in_char_class {
            // Class members are alternatives, not a literal sequence; only
            // track where the class ends. An escaped char (e.g. `\]`) is
            // consumed so it cannot close the class.
            match ch {
                '\\' => {
                    chars.next();
                }
                ']' => in_char_class = false,
                _ => {}
            }
            continue;
        }

        match ch {
            '\\' => match chars.next() {
                Some(escaped) if is_escaped_literal(escaped) => current_run += 1,
                // `\d`, `\w`, `\p`, ... (or a dangling backslash) end the run.
                _ => {
                    longest_run = longest_run.max(current_run);
                    current_run = 0;
                }
            },
            '[' => {
                longest_run = longest_run.max(current_run);
                current_run = 0;
                in_char_class = true;
            }
            '{' => {
                longest_run = longest_run.max(current_run);
                current_run = 0;
                // Skip repetition bounds / class names up to the closing brace
                // so `{100}` or `{Greek}` are not mistaken for literal text.
                for inner in chars.by_ref() {
                    if inner == '}' {
                        break;
                    }
                }
            }
            '(' => {
                longest_run = longest_run.max(current_run);
                current_run = 0;
                if chars.peek() == Some(&'?') {
                    chars.next();
                    skip_group_header(&mut chars);
                }
            }
            ')' | '.' | '*' | '+' | '?' | '}' | '|' | '^' | '$' => {
                longest_run = longest_run.max(current_run);
                current_run = 0;
            }
            // A stray `]` outside a class neither extends nor ends the run.
            ']' => {}
            _ => current_run += 1,
        }
    }

    longest_run.max(current_run) >= min_len
}

/// Consume the remainder of a group header after `(?` has been read: either
/// a capture name (`P<name>` / `<name>`) or a flag list optionally terminated
/// by `:`. Stops without consuming at the first character that belongs to
/// neither form, leaving it for the caller's main loop.
fn skip_group_header(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) {
    if chars.peek() == Some(&'P') {
        chars.next();
    }
    if chars.peek() == Some(&'<') {
        // Named group: discard everything through the closing `>`.
        for ch in chars.by_ref() {
            if ch == '>' {
                break;
            }
        }
        return;
    }
    while let Some(&ch) = chars.peek() {
        if ch == ':' {
            chars.next();
            return;
        }
        if !(ch.is_ascii_alphabetic() || ch == '-') {
            return;
        }
        chars.next();
    }
}

/// Return `true` when `ch`, written after a backslash, denotes that literal
/// regex metacharacter (e.g. `\.` or `\$`) rather than a class shorthand such
/// as `\d`.
fn is_escaped_literal(ch: char) -> bool {
    matches!(
        ch,
        '[' | ']' | '(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '\\' | '|' | '^' | '$'
    )
}
220
221fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
222    if let Some(ref verify) = spec.verify {
223        // verify.service defaults to the detector's service - empty is fine
224        if !verify.steps.is_empty() {
225            for step in &verify.steps {
226                validate_url(&step.url, issues);
227                check_url_exfil_risk(&step.url, &verify.allowed_domains, issues);
228            }
229        } else if let Some(ref url) = verify.url {
230            validate_url(url, issues);
231            check_url_exfil_risk(url, &verify.allowed_domains, issues);
232        } else {
233            issues.push(QualityIssue::Error(
234                "verify spec has no steps and no default URL".into(),
235            ));
236        }
237        check_oob_consistency(verify, issues);
238    }
239    check_reserved_companion_names(spec, issues);
240}
241
/// Reserved synthetic companion-map keys used by the OOB interpolator. A
/// detector that names a companion with one of these keys would either be
/// shadowed by the OOB injector or shadow it - either way, the verify
/// templates would resolve to surprising values. Reject the names so a
/// future detector author gets a clear error instead of a debugging
/// nightmare.
///
/// Only these exact names are rejected (`check_reserved_companion_names`
/// compares by string equality); other `__keyhog_oob_`-prefixed names are
/// not matched.
const RESERVED_COMPANION_NAMES: &[&str] =
    &["__keyhog_oob_url", "__keyhog_oob_host", "__keyhog_oob_id"];
250
251fn check_reserved_companion_names(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
252    for (i, c) in spec.companions.iter().enumerate() {
253        if RESERVED_COMPANION_NAMES.contains(&c.name.as_str()) {
254            issues.push(QualityIssue::Error(format!(
255                "companion {} name '{}' is reserved for the OOB interpolator. \
256                 Pick a different name; this collision would corrupt verify templates.",
257                i, c.name,
258            )));
259        }
260    }
261}
262
263/// Check that `[detector.verify.oob]` and `{{interactsh}}` template tokens
264/// are configured consistently:
265///
266/// - `oob` set but no `{{interactsh*}}` token anywhere in the verify
267///   templates → the wait_for parks for nothing; the probe never embeds
268///   the callback URL so the service can't reach our collector.
269/// - `{{interactsh*}}` token present but `oob` unset → the token resolves
270///   to an empty string at runtime, sending malformed requests (e.g.
271///   `https:///x` or a JSON body with `"target":""`).
272///
273/// Both are misconfigurations that load successfully but produce
274/// silently-wrong verify behavior. Fail-closed at the validator instead.
275fn check_oob_consistency(verify: &VerifySpec, issues: &mut Vec<QualityIssue>) {
276    let mut interactsh_referenced = false;
277    let mut scan = |s: &str| {
278        if s.contains("{{interactsh") {
279            interactsh_referenced = true;
280        }
281    };
282    if let Some(ref url) = verify.url {
283        scan(url);
284    }
285    if let Some(ref body) = verify.body {
286        scan(body);
287    }
288    for h in &verify.headers {
289        scan(&h.value);
290    }
291    for step in &verify.steps {
292        scan(&step.url);
293        if let Some(ref body) = step.body {
294            scan(body);
295        }
296        for h in &step.headers {
297            scan(&h.value);
298        }
299    }
300    let oob_configured = verify.oob.is_some();
301    match (oob_configured, interactsh_referenced) {
302        (true, false) => issues.push(QualityIssue::Error(
303            "verify.oob is set but no `{{interactsh}}` / `{{interactsh.host}}` / \
304             `{{interactsh.url}}` / `{{interactsh.id}}` token appears in any verify \
305             template - the OOB callback URL has nowhere to land, so the wait_for \
306             would always time out. Either embed an interactsh token in the body, \
307             URL, or a header - or remove the [detector.verify.oob] block."
308                .into(),
309        )),
310        (false, true) => issues.push(QualityIssue::Error(
311            "an `{{interactsh*}}` token is referenced in a verify template but no \
312             [detector.verify.oob] block is set - the token will resolve to an empty \
313             string at runtime and ship a malformed request to the service. Either \
314             add a [detector.verify.oob] block or remove the token."
315                .into(),
316        )),
317        _ => {}
318    }
319}
320
321/// Catch detectors whose `verify.url` is built from interpolation tokens
322/// without a fixed authoritative host AND without an explicit
323/// `allowed_domains` list. The verifier's runtime domain allowlist
324/// catches these at request time, but flagging at load time gives the
325/// detector author actionable feedback before the rule ships.
326/// kimi-wave3 §1 + §1.HIGH (single-brace `{var}` and `{{shop}}` cases).
327fn check_url_exfil_risk(url: &str, allowed_domains: &[String], issues: &mut Vec<QualityIssue>) {
328    // Detect `{{match}}` or `{{companion.*}}` taking the place of the
329    // authority component of the URL. Conservative match: anything that
330    // starts with the templated host (e.g. `https://{{...}}`, plain
331    // `{{match}}`, `https://{{...}}/path`).
332    let trimmed = url.trim();
333    let after_scheme = trimmed
334        .strip_prefix("https://")
335        .or_else(|| trimmed.strip_prefix("http://"))
336        .unwrap_or(trimmed);
337    let host_starts_with_template =
338        after_scheme.starts_with("{{") || after_scheme.starts_with("{") || trimmed == "{{match}}";
339    if host_starts_with_template && allowed_domains.is_empty() {
340        issues.push(QualityIssue::Error(
341            "verify URL host is templated and no `allowed_domains` is set - \
342             attacker-controlled interpolation could exfil credentials. \
343             Either hardcode the authoritative host in the URL or set \
344             `allowed_domains` explicitly. See kimi-wave3 §1."
345                .into(),
346        ));
347    }
348    // Single-brace `{name}` is a common author error - interpolate.rs
349    // only handles `{{...}}`, so `{name}` lands in the URL literally.
350    if url.contains('{') && !url.contains("{{") {
351        issues.push(QualityIssue::Error(
352            "verify URL uses single-brace `{var}` template syntax which the \
353             interpolator does NOT honor (only `{{var}}` works); the URL will \
354             be sent to a literal-string host. Use `{{companion.var}}`."
355                .into(),
356        ));
357    }
358}
359
360fn validate_url(url: &str, issues: &mut Vec<QualityIssue>) {
361    if url.is_empty() {
362        issues.push(QualityIssue::Error("verify URL is empty".into()));
363    }
364    if url.starts_with("http://") && !url.contains("localhost") {
365        issues.push(QualityIssue::Warning(
366            "verify URL uses HTTP instead of HTTPS".into(),
367        ));
368    }
369}
370
/// Return `true` when `pattern` starts with at least `min_len` characters
/// before the first of `[ ( . * + ? { \ | ^ $`.
///
/// Characters are counted as Unicode scalar values, not bytes. Closing
/// delimiters (`]`, `)`, `}`) are not treated as terminators.
fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
    let prefix_len = pattern
        .chars()
        .take_while(|&ch| {
            !matches!(
                ch,
                '[' | '(' | '.' | '*' | '+' | '?' | '{' | '\\' | '|' | '^' | '$'
            )
        })
        .count();
    prefix_len >= min_len
}
381
/// Heuristically detect a regex that is nothing but one character class with
/// an optional quantifier, e.g. `[A-Z0-9]`, `[a-z]+` or `[A-Z0-9]{10}`.
///
/// The pattern (ignoring surrounding whitespace) must start with `[`. The
/// class is taken to end at the first `]`; whatever follows must be empty, a
/// single `+`, `*` or `?`, or a `{...}` group with nothing after its first
/// `}`. Anything else - including a missing `]` or an unclosed `{` - yields
/// `false`.
fn is_pure_character_class(pattern: &str) -> bool {
    let candidate = pattern.trim();
    if !candidate.starts_with('[') {
        return false;
    }

    let Some((_, after_class)) = candidate.split_once(']') else {
        return false;
    };

    let quantifier = after_class.trim();
    match quantifier {
        "" | "+" | "*" | "?" => true,
        _ if quantifier.starts_with('{') => match quantifier.split_once('}') {
            Some((_, trailing)) => trailing.trim().is_empty(),
            None => false,
        },
        _ => false,
    }
}
407
408#[path = "validate_regex.rs"]
409mod validate_regex;
410use validate_regex::validate_regex_complexity;