1use super::DetectorSpec;
4use regex_syntax::ast::{self, Ast};
5use serde::Serialize;
6
7const MAX_REGEX_PATTERN_LEN: usize = 4096;
8const MAX_REGEX_AST_NODES: usize = 512;
9const MAX_REGEX_ALTERNATION_BRANCHES: usize = 64;
10const MAX_REGEX_REPEAT_BOUND: u32 = 1_000;
11
12#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
23pub enum QualityIssue {
24 Error(String),
25 Warning(String),
26}
27
28pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
53 let mut issues = Vec::new();
54 validate_patterns_present(spec, &mut issues);
55 validate_regexes(spec, &mut issues);
56 validate_keywords(spec, &mut issues);
57 validate_pattern_specificity(spec, &mut issues);
58 validate_companions(spec, &mut issues);
59 validate_verify_spec(spec, &mut issues);
60 issues
61}
62
63fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
64 if spec.patterns.is_empty() {
65 issues.push(QualityIssue::Error("no patterns defined".into()));
66 }
67}
68
69fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
70 for (i, pat) in spec.patterns.iter().enumerate() {
71 validate_regex_definition("pattern", i, &pat.regex, issues);
72 }
73}
74
75fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
76 if spec.keywords.is_empty() {
77 issues.push(QualityIssue::Warning(
78 "no keywords defined — pattern may produce false positives".into(),
79 ));
80 }
81}
82
83fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
84 for (i, pat) in spec.patterns.iter().enumerate() {
85 let has_prefix = has_literal_prefix(&pat.regex, 3);
86 let has_group = pat.group.is_some();
87 let is_pure_charclass = is_pure_character_class(&pat.regex);
88
89 if is_pure_charclass && !has_group {
90 issues.push(QualityIssue::Error(format!(
91 "pattern {} is a pure character class ({}) — too broad without context anchoring. \
92 Use a capture group or add a literal prefix.",
93 i, pat.regex
94 )));
95 } else if !has_prefix && !has_group && spec.keywords.is_empty() {
96 issues.push(QualityIssue::Warning(format!(
97 "pattern {} has no literal prefix and no capture group — may false-positive",
98 i
99 )));
100 }
101 }
102}
103
104fn validate_companions(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
105 for (i, companion) in spec.companions.iter().enumerate() {
106 if companion.name.trim().is_empty() {
107 issues.push(QualityIssue::Error(format!(
108 "companion {} name must not be empty",
109 i
110 )));
111 }
112 validate_regex_definition("companion", i, &companion.regex, issues);
113 if is_pure_character_class(&companion.regex) {
119 if companion.within_lines <= TIGHT_COMPANION_RADIUS {
120 issues.push(QualityIssue::Warning(format!(
121 "companion {} regex '{}' is a pure character class; \
122 allowed because within_lines={} ≤ {} (positional anchoring).",
123 i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
124 )));
125 } else {
126 issues.push(QualityIssue::Error(format!(
127 "companion {} regex '{}' is a pure character class with within_lines={} \
128 (> {}) — the wide search radius needs a literal context anchor",
129 i, companion.regex, companion.within_lines, TIGHT_COMPANION_RADIUS
130 )));
131 }
132 } else if !has_substantial_literal(&companion.regex, 3) {
133 issues.push(QualityIssue::Warning(format!(
134 "companion {} regex '{}' is too broad — may produce false positives. \
135 Add a context anchor like 'KEY_NAME='.",
136 i, companion.regex
137 )));
138 }
139 }
140}
141
142const TIGHT_COMPANION_RADIUS: usize = 5;
145
146fn validate_regex_definition(
147 kind: &str,
148 index: usize,
149 regex: &str,
150 issues: &mut Vec<QualityIssue>,
151) {
152 if regex.len() > MAX_REGEX_PATTERN_LEN {
153 issues.push(QualityIssue::Error(format!(
154 "{kind} {index} regex is too large ({} bytes > {} byte limit)",
155 regex.len(),
156 MAX_REGEX_PATTERN_LEN
157 )));
158 return;
159 }
160
161 match ast::parse::Parser::new().parse(regex) {
162 Ok(ast) => validate_regex_complexity(kind, index, &ast, issues),
163 Err(error) => issues.push(QualityIssue::Error(format!(
164 "{kind} {index} regex does not compile: {error}"
165 ))),
166 }
167}
168
169fn has_substantial_literal(pattern: &str, min_len: usize) -> bool {
170 let mut max_literal_len = 0;
171 let mut current_literal_len = 0;
172 let mut in_escape = false;
173 let mut in_char_class = false;
174
175 for ch in pattern.chars() {
176 if in_escape {
177 if is_escaped_literal(ch) {
178 current_literal_len += 1;
179 } else {
180 max_literal_len = max_literal_len.max(current_literal_len);
181 current_literal_len = 0;
182 }
183 in_escape = false;
184 continue;
185 }
186
187 match ch {
188 '\\' => in_escape = true,
189 '[' => {
190 max_literal_len = max_literal_len.max(current_literal_len);
191 current_literal_len = 0;
192 in_char_class = true;
193 }
194 ']' => {
195 in_char_class = false;
196 }
197 '(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '|' | '^' | '$' => {
198 max_literal_len = max_literal_len.max(current_literal_len);
199 current_literal_len = 0;
200 }
201 _ => {
202 if !in_char_class {
203 current_literal_len += 1;
204 }
205 }
206 }
207 }
208 max_literal_len = max_literal_len.max(current_literal_len);
209 max_literal_len >= min_len
210}
211
212fn is_escaped_literal(ch: char) -> bool {
213 matches!(
214 ch,
215 '[' | ']' | '(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '\\' | '|' | '^' | '$'
216 )
217}
218
219fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
220 if let Some(ref verify) = spec.verify {
221 if !verify.steps.is_empty() {
223 for step in &verify.steps {
224 validate_url(&step.url, issues);
225 check_url_exfil_risk(&step.url, &verify.allowed_domains, issues);
226 }
227 } else if let Some(ref url) = verify.url {
228 validate_url(url, issues);
229 check_url_exfil_risk(url, &verify.allowed_domains, issues);
230 } else {
231 issues.push(QualityIssue::Error(
232 "verify spec has no steps and no default URL".into(),
233 ));
234 }
235 }
236}
237
238fn check_url_exfil_risk(url: &str, allowed_domains: &[String], issues: &mut Vec<QualityIssue>) {
245 let trimmed = url.trim();
250 let after_scheme = trimmed
251 .strip_prefix("https://")
252 .or_else(|| trimmed.strip_prefix("http://"))
253 .unwrap_or(trimmed);
254 let host_starts_with_template =
255 after_scheme.starts_with("{{") || after_scheme.starts_with("{") || trimmed == "{{match}}";
256 if host_starts_with_template && allowed_domains.is_empty() {
257 issues.push(QualityIssue::Error(
258 "verify URL host is templated and no `allowed_domains` is set — \
259 attacker-controlled interpolation could exfil credentials. \
260 Either hardcode the authoritative host in the URL or set \
261 `allowed_domains` explicitly. See kimi-wave3 §1."
262 .into(),
263 ));
264 }
265 if url.contains('{') && !url.contains("{{") {
268 issues.push(QualityIssue::Error(
269 "verify URL uses single-brace `{var}` template syntax which the \
270 interpolator does NOT honor (only `{{var}}` works); the URL will \
271 be sent to a literal-string host. Use `{{companion.var}}`."
272 .into(),
273 ));
274 }
275}
276
277fn validate_url(url: &str, issues: &mut Vec<QualityIssue>) {
278 if url.is_empty() {
279 issues.push(QualityIssue::Error("verify URL is empty".into()));
280 }
281 if url.starts_with("http://") && !url.contains("localhost") {
282 issues.push(QualityIssue::Warning(
283 "verify URL uses HTTP instead of HTTPS".into(),
284 ));
285 }
286}
287
288fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
289 let mut count = 0;
290 for ch in pattern.chars() {
291 match ch {
292 '[' | '(' | '.' | '*' | '+' | '?' | '{' | '\\' | '|' | '^' | '$' => break,
293 _ => count += 1,
294 }
295 }
296 count >= min_len
297}
298
299fn is_pure_character_class(pattern: &str) -> bool {
300 let trimmed = pattern.trim();
301 if !trimmed.starts_with('[') {
302 return false;
303 }
304
305 let Some(close) = trimmed.find(']') else {
306 return false;
307 };
308 let remainder = trimmed[close + 1..].trim();
309 if remainder.is_empty() {
310 return true;
311 }
312 if remainder == "+" || remainder == "*" || remainder == "?" {
313 return true;
314 }
315 if remainder.starts_with('{') {
316 if let Some(qclose) = remainder.find('}') {
317 let after_quantifier = remainder[qclose + 1..].trim();
318 return after_quantifier.is_empty();
319 }
320 }
321
322 false
323}
324
325fn validate_regex_complexity(kind: &str, index: usize, ast: &Ast, issues: &mut Vec<QualityIssue>) {
326 let mut stats = RegexComplexityStats::default();
327 collect_regex_complexity(ast, &mut stats);
328 collect_redos_risks(ast, &mut stats, false);
329
330 if stats.nodes > MAX_REGEX_AST_NODES {
331 issues.push(QualityIssue::Error(format!(
332 "{kind} {index} regex is too complex ({} AST nodes > {} limit)",
333 stats.nodes, MAX_REGEX_AST_NODES
334 )));
335 }
336
337 if stats.max_alternation_branches > MAX_REGEX_ALTERNATION_BRANCHES {
338 issues.push(QualityIssue::Error(format!(
339 "{kind} {index} regex has too many alternation branches ({} > {} limit)",
340 stats.max_alternation_branches, MAX_REGEX_ALTERNATION_BRANCHES
341 )));
342 }
343
344 if stats.max_repeat_bound > MAX_REGEX_REPEAT_BOUND {
345 issues.push(QualityIssue::Error(format!(
346 "{kind} {index} regex has an excessive counted repetition bound ({} > {} limit)",
347 stats.max_repeat_bound, MAX_REGEX_REPEAT_BOUND
348 )));
349 }
350
351 if stats.has_nested_quantifier {
352 issues.push(QualityIssue::Error(format!(
353 "{kind} {index} regex contains nested quantifiers that can trigger pathological matching"
354 )));
355 }
356
357 if stats.has_quantified_overlapping_alternation {
358 issues.push(QualityIssue::Error(format!(
359 "{kind} {index} regex repeats overlapping alternations; use unambiguous branches instead"
360 )));
361 }
362}
363
364#[derive(Default)]
365struct RegexComplexityStats {
366 nodes: usize,
367 max_alternation_branches: usize,
368 max_repeat_bound: u32,
369 has_nested_quantifier: bool,
370 has_quantified_overlapping_alternation: bool,
371}
372
373fn collect_regex_complexity(ast: &Ast, stats: &mut RegexComplexityStats) {
374 stats.nodes += 1;
375 match ast {
376 Ast::Repetition(repetition) => {
377 update_repeat_bound(&repetition.op.kind, stats);
378 collect_regex_complexity(&repetition.ast, stats);
379 }
380 Ast::Group(group) => collect_regex_complexity(&group.ast, stats),
381 Ast::Alternation(alternation) => {
382 stats.max_alternation_branches =
383 stats.max_alternation_branches.max(alternation.asts.len());
384 for ast in &alternation.asts {
385 collect_regex_complexity(ast, stats);
386 }
387 }
388 Ast::Concat(concat) => {
389 for ast in &concat.asts {
390 collect_regex_complexity(ast, stats);
391 }
392 }
393 Ast::Empty(_)
394 | Ast::Flags(_)
395 | Ast::Literal(_)
396 | Ast::Dot(_)
397 | Ast::Assertion(_)
398 | Ast::ClassUnicode(_)
399 | Ast::ClassPerl(_)
400 | Ast::ClassBracketed(_) => {}
401 }
402}
403
404fn collect_redos_risks(ast: &Ast, stats: &mut RegexComplexityStats, inside_repetition: bool) {
405 match ast {
406 Ast::Repetition(repetition) => {
407 let this_is_simple_atom = matches!(
421 &*repetition.ast,
422 Ast::Literal(_)
423 | Ast::Dot(_)
424 | Ast::ClassBracketed(_)
425 | Ast::ClassPerl(_)
426 | Ast::ClassUnicode(_)
427 );
428 let this_is_unbounded = matches!(
429 repetition.op.kind,
430 ast::RepetitionKind::ZeroOrMore
431 | ast::RepetitionKind::OneOrMore
432 | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast { .. })
433 );
434 if inside_repetition && !this_is_simple_atom && this_is_unbounded {
438 stats.has_nested_quantifier = true;
439 }
440 if !inside_repetition
441 && this_is_unbounded
442 && !this_is_simple_atom
443 && ast_contains_repetition(&repetition.ast)
444 {
445 stats.has_nested_quantifier = true;
446 }
447 if alternation_has_overlapping_prefixes(&repetition.ast) {
448 stats.has_quantified_overlapping_alternation = true;
449 }
450 collect_redos_risks(
452 &repetition.ast,
453 stats,
454 inside_repetition || this_is_unbounded,
455 );
456 }
457 Ast::Group(group) => collect_redos_risks(&group.ast, stats, inside_repetition),
458 Ast::Alternation(alternation) => {
459 for ast in &alternation.asts {
460 collect_redos_risks(ast, stats, inside_repetition);
461 }
462 }
463 Ast::Concat(concat) => {
464 for ast in &concat.asts {
465 collect_redos_risks(ast, stats, inside_repetition);
466 }
467 }
468 Ast::Empty(_)
469 | Ast::Flags(_)
470 | Ast::Literal(_)
471 | Ast::Dot(_)
472 | Ast::Assertion(_)
473 | Ast::ClassUnicode(_)
474 | Ast::ClassPerl(_)
475 | Ast::ClassBracketed(_) => {}
476 }
477}
478
479fn ast_contains_repetition(ast: &Ast) -> bool {
480 match ast {
481 Ast::Repetition(_) => true,
482 Ast::Group(group) => ast_contains_repetition(&group.ast),
483 Ast::Alternation(alternation) => alternation.asts.iter().any(ast_contains_repetition),
484 Ast::Concat(concat) => concat.asts.iter().any(ast_contains_repetition),
485 Ast::Empty(_)
486 | Ast::Flags(_)
487 | Ast::Literal(_)
488 | Ast::Dot(_)
489 | Ast::Assertion(_)
490 | Ast::ClassUnicode(_)
491 | Ast::ClassPerl(_)
492 | Ast::ClassBracketed(_) => false,
493 }
494}
495
496fn alternation_has_overlapping_prefixes(ast: &Ast) -> bool {
497 let alternatives = match ast {
498 Ast::Alternation(alternation) => &alternation.asts,
499 Ast::Group(group) => return alternation_has_overlapping_prefixes(&group.ast),
500 _ => return false,
501 };
502
503 let prefixes = alternatives
504 .iter()
505 .filter_map(literalish_prefix)
506 .collect::<Vec<_>>();
507 for (idx, prefix) in prefixes.iter().enumerate() {
508 for other in prefixes.iter().skip(idx + 1) {
509 if prefix.starts_with(other) || other.starts_with(prefix) {
510 return true;
511 }
512 }
513 }
514 false
515}
516
517fn literalish_prefix(ast: &Ast) -> Option<String> {
518 match ast {
519 Ast::Literal(literal) => Some(literal.c.to_string()),
520 Ast::Concat(concat) => {
521 let mut prefix = String::new();
522 for node in &concat.asts {
523 match node {
524 Ast::Literal(literal) => prefix.push(literal.c),
525 Ast::Group(group) => prefix.push_str(&literalish_prefix(&group.ast)?),
526 _ => break,
527 }
528 }
529 (!prefix.is_empty()).then_some(prefix)
530 }
531 Ast::Group(group) => literalish_prefix(&group.ast),
532 _ => None,
533 }
534}
535
536fn update_repeat_bound(kind: &ast::RepetitionKind, stats: &mut RegexComplexityStats) {
537 let bound = match kind {
538 ast::RepetitionKind::ZeroOrOne => 1,
539 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => MAX_REGEX_REPEAT_BOUND,
540 ast::RepetitionKind::Range(range) => match range {
541 ast::RepetitionRange::Exactly(max)
542 | ast::RepetitionRange::AtLeast(max)
543 | ast::RepetitionRange::Bounded(_, max) => *max,
544 },
545 };
546 stats.max_repeat_bound = stats.max_repeat_bound.max(bound);
547}