1use super::DetectorSpec;
4use regex_syntax::ast::{self, Ast};
5use serde::Serialize;
6
/// Largest regex source, in bytes, that `validate_regex_definition` will parse; longer
/// sources are rejected before parsing.
const MAX_REGEX_PATTERN_LEN: usize = 4096;
/// Upper limit on the number of AST nodes counted for a single regex.
const MAX_REGEX_AST_NODES: usize = 512;
/// Upper limit on the number of branches in any single alternation of a regex.
const MAX_REGEX_ALTERNATION_BRANCHES: usize = 64;
/// Upper limit on a counted repetition bound; `update_repeat_bound` also records this value
/// for `*` and `+`.
const MAX_REGEX_REPEAT_BOUND: u32 = 1_000;
11
/// A single finding produced while validating a detector specification.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum QualityIssue {
    /// A finding reported at error level; the payload is the human-readable message.
    Error(String),
    /// A finding reported at warning level; the payload is the human-readable message.
    Warning(String),
}
27
28pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
53 let mut issues = Vec::new();
54 validate_patterns_present(spec, &mut issues);
55 validate_regexes(spec, &mut issues);
56 validate_keywords(spec, &mut issues);
57 validate_pattern_specificity(spec, &mut issues);
58 validate_companions(spec, &mut issues);
59 validate_verify_spec(spec, &mut issues);
60 issues
61}
62
63fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
64 if spec.patterns.is_empty() {
65 issues.push(QualityIssue::Error("no patterns defined".into()));
66 }
67}
68
69fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
70 for (i, pat) in spec.patterns.iter().enumerate() {
71 validate_regex_definition("pattern", i, &pat.regex, issues);
72 }
73}
74
75fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
76 if spec.keywords.is_empty() {
77 issues.push(QualityIssue::Warning(
78 "no keywords defined — pattern may produce false positives".into(),
79 ));
80 }
81}
82
83fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
84 for (i, pat) in spec.patterns.iter().enumerate() {
85 let has_prefix = has_literal_prefix(&pat.regex, 3);
86 let has_group = pat.group.is_some();
87 let is_pure_charclass = is_pure_character_class(&pat.regex);
88
89 if is_pure_charclass && !has_group {
90 issues.push(QualityIssue::Error(format!(
91 "pattern {} is a pure character class ({}) — too broad without context anchoring. \
92 Use a capture group or add a literal prefix.",
93 i, pat.regex
94 )));
95 } else if !has_prefix && !has_group && spec.keywords.is_empty() {
96 issues.push(QualityIssue::Warning(format!(
97 "pattern {} has no literal prefix and no capture group — may false-positive",
98 i
99 )));
100 }
101 }
102}
103
104fn validate_companions(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
105 for (i, companion) in spec.companions.iter().enumerate() {
106 if companion.name.trim().is_empty() {
107 issues.push(QualityIssue::Error(format!(
108 "companion {} name must not be empty",
109 i
110 )));
111 }
112 validate_regex_definition("companion", i, &companion.regex, issues);
113 if is_pure_character_class(&companion.regex) {
114 issues.push(QualityIssue::Error(format!(
115 "companion {} regex '{}' is a pure character class — add a literal context anchor",
116 i, companion.regex
117 )));
118 } else if !has_substantial_literal(&companion.regex, 3) {
119 issues.push(QualityIssue::Warning(format!(
120 "companion {} regex '{}' is too broad — may produce false positives. \
121 Add a context anchor like 'KEY_NAME='.",
122 i, companion.regex
123 )));
124 }
125 }
126}
127
128fn validate_regex_definition(
129 kind: &str,
130 index: usize,
131 regex: &str,
132 issues: &mut Vec<QualityIssue>,
133) {
134 if regex.len() > MAX_REGEX_PATTERN_LEN {
135 issues.push(QualityIssue::Error(format!(
136 "{kind} {index} regex is too large ({} bytes > {} byte limit)",
137 regex.len(),
138 MAX_REGEX_PATTERN_LEN
139 )));
140 return;
141 }
142
143 match ast::parse::Parser::new().parse(regex) {
144 Ok(ast) => validate_regex_complexity(kind, index, &ast, issues),
145 Err(error) => issues.push(QualityIssue::Error(format!(
146 "{kind} {index} regex does not compile: {error}"
147 ))),
148 }
149}
150
151fn has_substantial_literal(pattern: &str, min_len: usize) -> bool {
152 let mut max_literal_len = 0;
153 let mut current_literal_len = 0;
154 let mut in_escape = false;
155 let mut in_char_class = false;
156
157 for ch in pattern.chars() {
158 if in_escape {
159 if is_escaped_literal(ch) {
160 current_literal_len += 1;
161 } else {
162 max_literal_len = max_literal_len.max(current_literal_len);
163 current_literal_len = 0;
164 }
165 in_escape = false;
166 continue;
167 }
168
169 match ch {
170 '\\' => in_escape = true,
171 '[' => {
172 max_literal_len = max_literal_len.max(current_literal_len);
173 current_literal_len = 0;
174 in_char_class = true;
175 }
176 ']' => {
177 in_char_class = false;
178 }
179 '(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '|' | '^' | '$' => {
180 max_literal_len = max_literal_len.max(current_literal_len);
181 current_literal_len = 0;
182 }
183 _ => {
184 if !in_char_class {
185 current_literal_len += 1;
186 }
187 }
188 }
189 }
190 max_literal_len = max_literal_len.max(current_literal_len);
191 max_literal_len >= min_len
192}
193
/// Returns `true` when `ch` is one of the regex metacharacters that, once escaped with a
/// backslash, is counted as a single literal character.
fn is_escaped_literal(ch: char) -> bool {
    const ESCAPABLE_METACHARACTERS: &str = r"[]().*+?{}\|^$";
    ESCAPABLE_METACHARACTERS.contains(ch)
}
200
201fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
202 if let Some(ref verify) = spec.verify {
203 if !verify.steps.is_empty() {
205 for step in &verify.steps {
206 validate_url(&step.url, issues);
207 }
208 } else if let Some(ref url) = verify.url {
209 validate_url(url, issues);
210 } else {
211 issues.push(QualityIssue::Error(
212 "verify spec has no steps and no default URL".into(),
213 ));
214 }
215 }
216}
217
218fn validate_url(url: &str, issues: &mut Vec<QualityIssue>) {
219 if url.is_empty() {
220 issues.push(QualityIssue::Error("verify URL is empty".into()));
221 }
222 if url.starts_with("http://") && !url.contains("localhost") {
223 issues.push(QualityIssue::Warning(
224 "verify URL uses HTTP instead of HTTPS".into(),
225 ));
226 }
227}
228
/// Reports whether `pattern` starts with at least `min_len` characters that every match
/// must contain literally.
///
/// The prefix is the text before the first regex metacharacter (including `\`, `^` and `$`,
/// so an anchored or escaped start yields an empty prefix). When that metacharacter is a
/// quantifier that permits zero occurrences (`*`, `?`, or a `{0...}` bound), the last prefix
/// character is optional and is not counted: `abc?` guarantees only `ab`.
fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
    let is_meta = |c: char| {
        matches!(
            c,
            '[' | '(' | '.' | '*' | '+' | '?' | '{' | '\\' | '|' | '^' | '$'
        )
    };
    let (literal, rest) = match pattern.find(is_meta) {
        Some(stop) => pattern.split_at(stop),
        None => (pattern, ""),
    };

    let mut following = rest.chars();
    let last_is_optional = match following.next() {
        Some('*') | Some('?') => true,
        Some('{') => {
            // `{0}`, `{0,}` and `{0,n}` all allow the preceding character to be absent.
            let minimum: String = following.take_while(char::is_ascii_digit).collect();
            !minimum.is_empty() && minimum.bytes().all(|digit| digit == b'0')
        }
        _ => false,
    };

    let counted = literal.chars().count();
    let guaranteed = if last_is_optional {
        counted.saturating_sub(1)
    } else {
        counted
    };
    guaranteed >= min_len
}
239
/// Reports whether `pattern` is nothing but one bracketed character class, optionally
/// followed by a single quantifier (`+`, `*`, `?`, `{...}`, or a lazy form such as `+?`).
///
/// Surrounding whitespace of `pattern` is ignored. The end of the class is located by
/// skipping escaped characters and tracking nested brackets, so `[a-z\]]+` and
/// `[[:alnum:]]{32}` are recognised as pure classes.
fn is_pure_character_class(pattern: &str) -> bool {
    /// Byte index of the `]` closing the class that opens at the start of `source`.
    fn closing_bracket(source: &str) -> Option<usize> {
        let mut depth = 0usize;
        let mut escaped = false;
        for (index, ch) in source.char_indices() {
            if escaped {
                escaped = false;
                continue;
            }
            match ch {
                '\\' => escaped = true,
                '[' => depth += 1,
                ']' => {
                    depth = depth.saturating_sub(1);
                    if depth == 0 {
                        return Some(index);
                    }
                }
                _ => {}
            }
        }
        None
    }

    let trimmed = pattern.trim();
    if !trimmed.starts_with('[') {
        return false;
    }
    let Some(close) = closing_bracket(trimmed) else {
        return false;
    };

    let remainder = trimmed[close + 1..].trim();
    // A trailing `?` after another quantifier only makes it lazy; a lone `?` is itself the
    // quantifier and must be kept.
    let quantifier = match remainder.strip_suffix('?') {
        Some(eager) if !eager.is_empty() => eager,
        _ => remainder,
    };

    match quantifier {
        "" | "+" | "*" | "?" => true,
        _ if quantifier.starts_with('{') => match quantifier.find('}') {
            Some(end) => quantifier[end + 1..].is_empty(),
            None => false,
        },
        _ => false,
    }
}
265
266fn validate_regex_complexity(kind: &str, index: usize, ast: &Ast, issues: &mut Vec<QualityIssue>) {
267 let mut stats = RegexComplexityStats::default();
268 collect_regex_complexity(ast, &mut stats);
269 collect_redos_risks(ast, &mut stats, false);
270
271 if stats.nodes > MAX_REGEX_AST_NODES {
272 issues.push(QualityIssue::Error(format!(
273 "{kind} {index} regex is too complex ({} AST nodes > {} limit)",
274 stats.nodes, MAX_REGEX_AST_NODES
275 )));
276 }
277
278 if stats.max_alternation_branches > MAX_REGEX_ALTERNATION_BRANCHES {
279 issues.push(QualityIssue::Error(format!(
280 "{kind} {index} regex has too many alternation branches ({} > {} limit)",
281 stats.max_alternation_branches, MAX_REGEX_ALTERNATION_BRANCHES
282 )));
283 }
284
285 if stats.max_repeat_bound > MAX_REGEX_REPEAT_BOUND {
286 issues.push(QualityIssue::Error(format!(
287 "{kind} {index} regex has an excessive counted repetition bound ({} > {} limit)",
288 stats.max_repeat_bound, MAX_REGEX_REPEAT_BOUND
289 )));
290 }
291
292 if stats.has_nested_quantifier {
293 issues.push(QualityIssue::Error(format!(
294 "{kind} {index} regex contains nested quantifiers that can trigger pathological matching"
295 )));
296 }
297
298 if stats.has_quantified_overlapping_alternation {
299 issues.push(QualityIssue::Error(format!(
300 "{kind} {index} regex repeats overlapping alternations; use unambiguous branches instead"
301 )));
302 }
303}
304
/// Measurements gathered from one regex AST by `collect_regex_complexity` and
/// `collect_redos_risks`, then compared against the limits in `validate_regex_complexity`.
#[derive(Default)]
struct RegexComplexityStats {
    /// Number of AST nodes visited.
    nodes: usize,
    /// Branch count of the widest alternation seen.
    max_alternation_branches: usize,
    /// Largest repetition bound seen, as computed by `update_repeat_bound`.
    max_repeat_bound: u32,
    /// Set when a nested-quantifier shape was found.
    has_nested_quantifier: bool,
    /// Set when a repeated alternation has branches with overlapping literal prefixes.
    has_quantified_overlapping_alternation: bool,
}
313
314fn collect_regex_complexity(ast: &Ast, stats: &mut RegexComplexityStats) {
315 stats.nodes += 1;
316 match ast {
317 Ast::Repetition(repetition) => {
318 update_repeat_bound(&repetition.op.kind, stats);
319 collect_regex_complexity(&repetition.ast, stats);
320 }
321 Ast::Group(group) => collect_regex_complexity(&group.ast, stats),
322 Ast::Alternation(alternation) => {
323 stats.max_alternation_branches =
324 stats.max_alternation_branches.max(alternation.asts.len());
325 for ast in &alternation.asts {
326 collect_regex_complexity(ast, stats);
327 }
328 }
329 Ast::Concat(concat) => {
330 for ast in &concat.asts {
331 collect_regex_complexity(ast, stats);
332 }
333 }
334 Ast::Empty(_)
335 | Ast::Flags(_)
336 | Ast::Literal(_)
337 | Ast::Dot(_)
338 | Ast::Assertion(_)
339 | Ast::ClassUnicode(_)
340 | Ast::ClassPerl(_)
341 | Ast::ClassBracketed(_) => {}
342 }
343}
344
345fn collect_redos_risks(ast: &Ast, stats: &mut RegexComplexityStats, inside_repetition: bool) {
346 match ast {
347 Ast::Repetition(repetition) => {
348 let this_is_simple_atom = matches!(
362 &*repetition.ast,
363 Ast::Literal(_)
364 | Ast::Dot(_)
365 | Ast::ClassBracketed(_)
366 | Ast::ClassPerl(_)
367 | Ast::ClassUnicode(_)
368 );
369 let this_is_unbounded = matches!(
370 repetition.op.kind,
371 ast::RepetitionKind::ZeroOrMore
372 | ast::RepetitionKind::OneOrMore
373 | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast { .. })
374 );
375 if inside_repetition && !this_is_simple_atom && this_is_unbounded {
379 stats.has_nested_quantifier = true;
380 }
381 if !inside_repetition
382 && this_is_unbounded
383 && !this_is_simple_atom
384 && ast_contains_repetition(&repetition.ast)
385 {
386 stats.has_nested_quantifier = true;
387 }
388 if alternation_has_overlapping_prefixes(&repetition.ast) {
389 stats.has_quantified_overlapping_alternation = true;
390 }
391 collect_redos_risks(
393 &repetition.ast,
394 stats,
395 inside_repetition || this_is_unbounded,
396 );
397 }
398 Ast::Group(group) => collect_redos_risks(&group.ast, stats, inside_repetition),
399 Ast::Alternation(alternation) => {
400 for ast in &alternation.asts {
401 collect_redos_risks(ast, stats, inside_repetition);
402 }
403 }
404 Ast::Concat(concat) => {
405 for ast in &concat.asts {
406 collect_redos_risks(ast, stats, inside_repetition);
407 }
408 }
409 Ast::Empty(_)
410 | Ast::Flags(_)
411 | Ast::Literal(_)
412 | Ast::Dot(_)
413 | Ast::Assertion(_)
414 | Ast::ClassUnicode(_)
415 | Ast::ClassPerl(_)
416 | Ast::ClassBracketed(_) => {}
417 }
418}
419
420fn ast_contains_repetition(ast: &Ast) -> bool {
421 match ast {
422 Ast::Repetition(_) => true,
423 Ast::Group(group) => ast_contains_repetition(&group.ast),
424 Ast::Alternation(alternation) => alternation.asts.iter().any(ast_contains_repetition),
425 Ast::Concat(concat) => concat.asts.iter().any(ast_contains_repetition),
426 Ast::Empty(_)
427 | Ast::Flags(_)
428 | Ast::Literal(_)
429 | Ast::Dot(_)
430 | Ast::Assertion(_)
431 | Ast::ClassUnicode(_)
432 | Ast::ClassPerl(_)
433 | Ast::ClassBracketed(_) => false,
434 }
435}
436
437fn alternation_has_overlapping_prefixes(ast: &Ast) -> bool {
438 let alternatives = match ast {
439 Ast::Alternation(alternation) => &alternation.asts,
440 Ast::Group(group) => return alternation_has_overlapping_prefixes(&group.ast),
441 _ => return false,
442 };
443
444 let prefixes = alternatives
445 .iter()
446 .filter_map(literalish_prefix)
447 .collect::<Vec<_>>();
448 for (idx, prefix) in prefixes.iter().enumerate() {
449 for other in prefixes.iter().skip(idx + 1) {
450 if prefix.starts_with(other) || other.starts_with(prefix) {
451 return true;
452 }
453 }
454 }
455 false
456}
457
458fn literalish_prefix(ast: &Ast) -> Option<String> {
459 match ast {
460 Ast::Literal(literal) => Some(literal.c.to_string()),
461 Ast::Concat(concat) => {
462 let mut prefix = String::new();
463 for node in &concat.asts {
464 match node {
465 Ast::Literal(literal) => prefix.push(literal.c),
466 Ast::Group(group) => prefix.push_str(&literalish_prefix(&group.ast)?),
467 _ => break,
468 }
469 }
470 (!prefix.is_empty()).then_some(prefix)
471 }
472 Ast::Group(group) => literalish_prefix(&group.ast),
473 _ => None,
474 }
475}
476
477fn update_repeat_bound(kind: &ast::RepetitionKind, stats: &mut RegexComplexityStats) {
478 let bound = match kind {
479 ast::RepetitionKind::ZeroOrOne => 1,
480 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => MAX_REGEX_REPEAT_BOUND,
481 ast::RepetitionKind::Range(range) => match range {
482 ast::RepetitionRange::Exactly(max)
483 | ast::RepetitionRange::AtLeast(max)
484 | ast::RepetitionRange::Bounded(_, max) => *max,
485 },
486 };
487 stats.max_repeat_bound = stats.max_repeat_bound.max(bound);
488}
489
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Severity;

    /// Builds a detector with one keyword and a single pattern using `regex`, with no capture
    /// group, no verify spec, and no companions.
    fn detector_with_pattern(regex: &str) -> DetectorSpec {
        DetectorSpec {
            id: "test-detector".into(),
            name: "Test Detector".into(),
            service: "test".into(),
            severity: Severity::High,
            keywords: vec!["token".into()],
            patterns: vec![crate::PatternSpec {
                regex: regex.into(),
                description: None,
                group: None,
            }],
            verify: None,
            companions: Vec::new(),
        }
    }

    #[test]
    fn rejects_excessive_alternation_fanout() {
        // 65 branches is one more than MAX_REGEX_ALTERNATION_BRANCHES.
        let regex = (0..65)
            .map(|i| format!("opt{i}"))
            .collect::<Vec<_>>()
            .join("|");
        let issues = validate_detector(&detector_with_pattern(&regex));

        assert!(issues.iter().any(|issue| matches!(
            issue,
            QualityIssue::Error(message) if message.contains("alternation branches")
        )));
    }

    #[test]
    fn rejects_excessive_counted_repetition() {
        let issues = validate_detector(&detector_with_pattern("token[a-z]{10001}"));

        assert!(issues.iter().any(|issue| matches!(
            issue,
            QualityIssue::Error(message) if message.contains("counted repetition bound")
        )));
    }

    #[test]
    fn rejects_nested_quantifiers() {
        let issues = validate_detector(&detector_with_pattern("(a+)+b"));

        assert!(issues.iter().any(|issue| matches!(
            issue,
            QualityIssue::Error(message) if message.contains("nested quantifiers")
        )));
    }

    #[test]
    fn rejects_quantified_overlapping_alternation() {
        let issues = validate_detector(&detector_with_pattern("(ab|a)+z"));

        assert!(issues.iter().any(|issue| matches!(
            issue,
            QualityIssue::Error(message) if message.contains("overlapping alternations")
        )));
    }

    #[test]
    fn rejects_invalid_companion_regexes() {
        let mut detector = detector_with_pattern("token_[A-Z0-9]{8}");
        detector.companions.push(crate::CompanionSpec {
            name: "secret".into(),
            regex: "(".into(),
            within_lines: 3,
            required: false,
        });

        let issues = validate_detector(&detector);
        assert!(issues.iter().any(|issue| matches!(
            issue,
            QualityIssue::Error(message)
                if message.contains("companion 0 regex does not compile")
        )));
    }

    #[test]
    fn rejects_broad_companion_character_class() {
        let mut detector = detector_with_pattern("token_[A-Z0-9]{8}");
        detector.companions.push(crate::CompanionSpec {
            name: "secret".into(),
            regex: "[A-Za-z0-9+/=]{40,}".into(),
            within_lines: 3,
            required: false,
        });

        let issues = validate_detector(&detector);
        assert!(issues.iter().any(|issue| matches!(
            issue,
            QualityIssue::Error(message) if message.contains("pure character class")
        )));
    }
}