use super::DetectorSpec;
use regex_syntax::ast::{self, Ast};
use serde::Serialize;
const MAX_REGEX_PATTERN_LEN: usize = 4096;
const MAX_REGEX_AST_NODES: usize = 512;
const MAX_REGEX_ALTERNATION_BRANCHES: usize = 64;
const MAX_REGEX_REPEAT_BOUND: u32 = 1_000;
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum QualityIssue {
Error(String),
Warning(String),
}
pub fn validate_detector(spec: &DetectorSpec) -> Vec<QualityIssue> {
let mut issues = Vec::new();
validate_patterns_present(spec, &mut issues);
validate_regexes(spec, &mut issues);
validate_keywords(spec, &mut issues);
validate_pattern_specificity(spec, &mut issues);
validate_companions(spec, &mut issues);
validate_verify_spec(spec, &mut issues);
issues
}
fn validate_patterns_present(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
if spec.patterns.is_empty() {
issues.push(QualityIssue::Error("no patterns defined".into()));
}
}
fn validate_regexes(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
for (i, pat) in spec.patterns.iter().enumerate() {
validate_regex_definition("pattern", i, &pat.regex, issues);
}
}
fn validate_keywords(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
if spec.keywords.is_empty() {
issues.push(QualityIssue::Warning(
"no keywords defined — pattern may produce false positives".into(),
));
}
}
fn validate_pattern_specificity(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
for (i, pat) in spec.patterns.iter().enumerate() {
let has_prefix = has_literal_prefix(&pat.regex, 3);
let has_group = pat.group.is_some();
let is_pure_charclass = is_pure_character_class(&pat.regex);
if is_pure_charclass && !has_group {
issues.push(QualityIssue::Error(format!(
"pattern {} is a pure character class ({}) — too broad without context anchoring. \
Use a capture group or add a literal prefix.",
i, pat.regex
)));
} else if !has_prefix && !has_group && spec.keywords.is_empty() {
issues.push(QualityIssue::Warning(format!(
"pattern {} has no literal prefix and no capture group — may false-positive",
i
)));
}
}
}
fn validate_companions(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
for (i, companion) in spec.companions.iter().enumerate() {
if companion.name.trim().is_empty() {
issues.push(QualityIssue::Error(format!(
"companion {} name must not be empty",
i
)));
}
validate_regex_definition("companion", i, &companion.regex, issues);
if is_pure_character_class(&companion.regex) {
issues.push(QualityIssue::Error(format!(
"companion {} regex '{}' is a pure character class — add a literal context anchor",
i, companion.regex
)));
} else if !has_substantial_literal(&companion.regex, 3) {
issues.push(QualityIssue::Warning(format!(
"companion {} regex '{}' is too broad — may produce false positives. \
Add a context anchor like 'KEY_NAME='.",
i, companion.regex
)));
}
}
}
fn validate_regex_definition(
kind: &str,
index: usize,
regex: &str,
issues: &mut Vec<QualityIssue>,
) {
if regex.len() > MAX_REGEX_PATTERN_LEN {
issues.push(QualityIssue::Error(format!(
"{kind} {index} regex is too large ({} bytes > {} byte limit)",
regex.len(),
MAX_REGEX_PATTERN_LEN
)));
return;
}
match ast::parse::Parser::new().parse(regex) {
Ok(ast) => validate_regex_complexity(kind, index, &ast, issues),
Err(error) => issues.push(QualityIssue::Error(format!(
"{kind} {index} regex does not compile: {error}"
))),
}
}
fn has_substantial_literal(pattern: &str, min_len: usize) -> bool {
let mut max_literal_len = 0;
let mut current_literal_len = 0;
let mut in_escape = false;
let mut in_char_class = false;
for ch in pattern.chars() {
if in_escape {
if is_escaped_literal(ch) {
current_literal_len += 1;
} else {
max_literal_len = max_literal_len.max(current_literal_len);
current_literal_len = 0;
}
in_escape = false;
continue;
}
match ch {
'\\' => in_escape = true,
'[' => {
max_literal_len = max_literal_len.max(current_literal_len);
current_literal_len = 0;
in_char_class = true;
}
']' => {
in_char_class = false;
}
'(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '|' | '^' | '$' => {
max_literal_len = max_literal_len.max(current_literal_len);
current_literal_len = 0;
}
_ => {
if !in_char_class {
current_literal_len += 1;
}
}
}
}
max_literal_len = max_literal_len.max(current_literal_len);
max_literal_len >= min_len
}
fn is_escaped_literal(ch: char) -> bool {
matches!(
ch,
'[' | ']' | '(' | ')' | '.' | '*' | '+' | '?' | '{' | '}' | '\\' | '|' | '^' | '$'
)
}
fn validate_verify_spec(spec: &DetectorSpec, issues: &mut Vec<QualityIssue>) {
if let Some(ref verify) = spec.verify {
if !verify.steps.is_empty() {
for step in &verify.steps {
validate_url(&step.url, issues);
}
} else if let Some(ref url) = verify.url {
validate_url(url, issues);
} else {
issues.push(QualityIssue::Error(
"verify spec has no steps and no default URL".into(),
));
}
}
}
fn validate_url(url: &str, issues: &mut Vec<QualityIssue>) {
if url.is_empty() {
issues.push(QualityIssue::Error("verify URL is empty".into()));
}
if url.starts_with("http://") && !url.contains("localhost") {
issues.push(QualityIssue::Warning(
"verify URL uses HTTP instead of HTTPS".into(),
));
}
}
fn has_literal_prefix(pattern: &str, min_len: usize) -> bool {
let mut count = 0;
for ch in pattern.chars() {
match ch {
'[' | '(' | '.' | '*' | '+' | '?' | '{' | '\\' | '|' | '^' | '$' => break,
_ => count += 1,
}
}
count >= min_len
}
fn is_pure_character_class(pattern: &str) -> bool {
let trimmed = pattern.trim();
if !trimmed.starts_with('[') {
return false;
}
let Some(close) = trimmed.find(']') else {
return false;
};
let remainder = trimmed[close + 1..].trim();
if remainder.is_empty() {
return true;
}
if remainder == "+" || remainder == "*" || remainder == "?" {
return true;
}
if remainder.starts_with('{')
&& let Some(qclose) = remainder.find('}')
{
let after_quantifier = remainder[qclose + 1..].trim();
return after_quantifier.is_empty();
}
false
}
fn validate_regex_complexity(kind: &str, index: usize, ast: &Ast, issues: &mut Vec<QualityIssue>) {
let mut stats = RegexComplexityStats::default();
collect_regex_complexity(ast, &mut stats);
collect_redos_risks(ast, &mut stats, false);
if stats.nodes > MAX_REGEX_AST_NODES {
issues.push(QualityIssue::Error(format!(
"{kind} {index} regex is too complex ({} AST nodes > {} limit)",
stats.nodes, MAX_REGEX_AST_NODES
)));
}
if stats.max_alternation_branches > MAX_REGEX_ALTERNATION_BRANCHES {
issues.push(QualityIssue::Error(format!(
"{kind} {index} regex has too many alternation branches ({} > {} limit)",
stats.max_alternation_branches, MAX_REGEX_ALTERNATION_BRANCHES
)));
}
if stats.max_repeat_bound > MAX_REGEX_REPEAT_BOUND {
issues.push(QualityIssue::Error(format!(
"{kind} {index} regex has an excessive counted repetition bound ({} > {} limit)",
stats.max_repeat_bound, MAX_REGEX_REPEAT_BOUND
)));
}
if stats.has_nested_quantifier {
issues.push(QualityIssue::Error(format!(
"{kind} {index} regex contains nested quantifiers that can trigger pathological matching"
)));
}
if stats.has_quantified_overlapping_alternation {
issues.push(QualityIssue::Error(format!(
"{kind} {index} regex repeats overlapping alternations; use unambiguous branches instead"
)));
}
}
#[derive(Default)]
struct RegexComplexityStats {
nodes: usize,
max_alternation_branches: usize,
max_repeat_bound: u32,
has_nested_quantifier: bool,
has_quantified_overlapping_alternation: bool,
}
fn collect_regex_complexity(ast: &Ast, stats: &mut RegexComplexityStats) {
stats.nodes += 1;
match ast {
Ast::Repetition(repetition) => {
update_repeat_bound(&repetition.op.kind, stats);
collect_regex_complexity(&repetition.ast, stats);
}
Ast::Group(group) => collect_regex_complexity(&group.ast, stats),
Ast::Alternation(alternation) => {
stats.max_alternation_branches =
stats.max_alternation_branches.max(alternation.asts.len());
for ast in &alternation.asts {
collect_regex_complexity(ast, stats);
}
}
Ast::Concat(concat) => {
for ast in &concat.asts {
collect_regex_complexity(ast, stats);
}
}
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::ClassUnicode(_)
| Ast::ClassPerl(_)
| Ast::ClassBracketed(_) => {}
}
}
fn collect_redos_risks(ast: &Ast, stats: &mut RegexComplexityStats, inside_repetition: bool) {
match ast {
Ast::Repetition(repetition) => {
let this_is_simple_atom = matches!(
&*repetition.ast,
Ast::Literal(_)
| Ast::Dot(_)
| Ast::ClassBracketed(_)
| Ast::ClassPerl(_)
| Ast::ClassUnicode(_)
);
let this_is_unbounded = matches!(
repetition.op.kind,
ast::RepetitionKind::ZeroOrMore
| ast::RepetitionKind::OneOrMore
| ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast { .. })
);
if inside_repetition && !this_is_simple_atom && this_is_unbounded {
stats.has_nested_quantifier = true;
}
if !inside_repetition
&& this_is_unbounded
&& !this_is_simple_atom
&& ast_contains_repetition(&repetition.ast)
{
stats.has_nested_quantifier = true;
}
if alternation_has_overlapping_prefixes(&repetition.ast) {
stats.has_quantified_overlapping_alternation = true;
}
collect_redos_risks(
&repetition.ast,
stats,
inside_repetition || this_is_unbounded,
);
}
Ast::Group(group) => collect_redos_risks(&group.ast, stats, inside_repetition),
Ast::Alternation(alternation) => {
for ast in &alternation.asts {
collect_redos_risks(ast, stats, inside_repetition);
}
}
Ast::Concat(concat) => {
for ast in &concat.asts {
collect_redos_risks(ast, stats, inside_repetition);
}
}
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::ClassUnicode(_)
| Ast::ClassPerl(_)
| Ast::ClassBracketed(_) => {}
}
}
fn ast_contains_repetition(ast: &Ast) -> bool {
match ast {
Ast::Repetition(_) => true,
Ast::Group(group) => ast_contains_repetition(&group.ast),
Ast::Alternation(alternation) => alternation.asts.iter().any(ast_contains_repetition),
Ast::Concat(concat) => concat.asts.iter().any(ast_contains_repetition),
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::ClassUnicode(_)
| Ast::ClassPerl(_)
| Ast::ClassBracketed(_) => false,
}
}
fn alternation_has_overlapping_prefixes(ast: &Ast) -> bool {
let alternatives = match ast {
Ast::Alternation(alternation) => &alternation.asts,
Ast::Group(group) => return alternation_has_overlapping_prefixes(&group.ast),
_ => return false,
};
let prefixes = alternatives
.iter()
.filter_map(literalish_prefix)
.collect::<Vec<_>>();
for (idx, prefix) in prefixes.iter().enumerate() {
for other in prefixes.iter().skip(idx + 1) {
if prefix.starts_with(other) || other.starts_with(prefix) {
return true;
}
}
}
false
}
fn literalish_prefix(ast: &Ast) -> Option<String> {
match ast {
Ast::Literal(literal) => Some(literal.c.to_string()),
Ast::Concat(concat) => {
let mut prefix = String::new();
for node in &concat.asts {
match node {
Ast::Literal(literal) => prefix.push(literal.c),
Ast::Group(group) => prefix.push_str(&literalish_prefix(&group.ast)?),
_ => break,
}
}
(!prefix.is_empty()).then_some(prefix)
}
Ast::Group(group) => literalish_prefix(&group.ast),
_ => None,
}
}
fn update_repeat_bound(kind: &ast::RepetitionKind, stats: &mut RegexComplexityStats) {
let bound = match kind {
ast::RepetitionKind::ZeroOrOne => 1,
ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => MAX_REGEX_REPEAT_BOUND,
ast::RepetitionKind::Range(range) => match range {
ast::RepetitionRange::Exactly(max)
| ast::RepetitionRange::AtLeast(max)
| ast::RepetitionRange::Bounded(_, max) => *max,
},
};
stats.max_repeat_bound = stats.max_repeat_bound.max(bound);
}
#[cfg(test)]
mod tests {
use super::*;
use crate::Severity;
fn detector_with_pattern(regex: &str) -> DetectorSpec {
DetectorSpec {
id: "test-detector".into(),
name: "Test Detector".into(),
service: "test".into(),
severity: Severity::High,
keywords: vec!["token".into()],
patterns: vec![crate::PatternSpec {
regex: regex.into(),
description: None,
group: None,
}],
verify: None,
companions: Vec::new(),
}
}
#[test]
fn rejects_excessive_alternation_fanout() {
let regex = (0..65)
.map(|i| format!("opt{i}"))
.collect::<Vec<_>>()
.join("|");
let issues = validate_detector(&detector_with_pattern(®ex));
assert!(issues.iter().any(|issue| matches!(
issue,
QualityIssue::Error(message) if message.contains("alternation branches")
)));
}
#[test]
fn rejects_excessive_counted_repetition() {
let issues = validate_detector(&detector_with_pattern("token[a-z]{10001}"));
assert!(issues.iter().any(|issue| matches!(
issue,
QualityIssue::Error(message) if message.contains("counted repetition bound")
)));
}
#[test]
fn rejects_nested_quantifiers() {
let issues = validate_detector(&detector_with_pattern("(a+)+b"));
assert!(issues.iter().any(|issue| matches!(
issue,
QualityIssue::Error(message) if message.contains("nested quantifiers")
)));
}
#[test]
fn rejects_quantified_overlapping_alternation() {
let issues = validate_detector(&detector_with_pattern("(ab|a)+z"));
assert!(issues.iter().any(|issue| matches!(
issue,
QualityIssue::Error(message) if message.contains("overlapping alternations")
)));
}
#[test]
fn rejects_invalid_companion_regexes() {
let mut detector = detector_with_pattern("token_[A-Z0-9]{8}");
detector.companions.push(crate::CompanionSpec {
name: "secret".into(),
regex: "(".into(),
within_lines: 3,
required: false,
});
let issues = validate_detector(&detector);
assert!(issues.iter().any(|issue| matches!(
issue,
QualityIssue::Error(message)
if message.contains("companion 0 regex does not compile")
)));
}
#[test]
fn rejects_broad_companion_character_class() {
let mut detector = detector_with_pattern("token_[A-Z0-9]{8}");
detector.companions.push(crate::CompanionSpec {
name: "secret".into(),
regex: "[A-Za-z0-9+/=]{40,}".into(),
within_lines: 3,
required: false,
});
let issues = validate_detector(&detector);
assert!(issues.iter().any(|issue| matches!(
issue,
QualityIssue::Error(message) if message.contains("pure character class")
)));
}
}