use once_cell::sync::Lazy;
use regex::{Regex, RegexSet};
use serde::Deserialize;
use std::collections::HashMap;
use std::sync::RwLock;
/// Longest header, cookie, or body pattern (measured with `str::len`, i.e. bytes)
/// that `compile_waf` will compile; longer patterns are skipped with a warning.
const MAX_REGEX_PATTERN_LEN: usize = 4096;
/// Upper bound on how many body patterns `compile_body_regex_set` places into the
/// shared `RegexSet`.
const MAX_BODY_REGEX_PATTERNS: usize = 2000;
/// Floor applied to a rule's confidence threshold in `RuleEngine::detect` when none
/// of the collected indicators for that WAF came from a status, header, or cookie match.
const BODY_ONLY_MIN_CONFIDENCE: f64 = 0.5;
/// Size limit passed to `size_limit` on the regex builders used in this file.
/// The value itself is defined in `wafrift_types`.
const REGEX_COMPILE_SIZE_LIMIT: usize = wafrift_types::REGEX_NFA_SIZE_LIMIT;
/// Compiles `pattern` as a case-insensitive regex unless the pattern itself
/// opens with a flag group that already mentions the `i` flag.
///
/// * `pattern` - regex source as written in the rule file.
/// * `kind` - label ("header", "cookie", "body") used only in the error text.
///
/// A leading group such as `(?i)`, `(?-i)`, `(?im)` or `(?i:...)` is taken as
/// the author's explicit choice and the pattern is compiled unchanged;
/// everything else is prefixed with `(?i)`.
///
/// # Errors
///
/// Returns a message naming `kind` and echoing `pattern` when the regex does
/// not compile, including when it exceeds `REGEX_COMPILE_SIZE_LIMIT`.
fn compile_ci_regex(pattern: &str, kind: &str) -> Result<Regex, String> {
    // The text between `(?` and the first `:` or `)` only counts as a flag
    // group when every character is a flag letter or `-`. Without that check
    // a named capture group such as `(?P<id>...)` would be mistaken for a
    // group carrying the `i` flag and the pattern would silently stay
    // case-sensitive.
    let has_outer_case_flag = pattern.strip_prefix("(?").is_some_and(|rest| {
        rest.split([':', ')']).next().is_some_and(|flags| {
            flags.contains('i')
                && flags
                    .chars()
                    .all(|c| matches!(c, 'i' | 'm' | 's' | 'x' | 'u' | 'U' | 'R' | '-'))
        })
    });
    let full = if has_outer_case_flag {
        pattern.to_string()
    } else {
        format!("(?i){pattern}")
    };
    regex::RegexBuilder::new(&full)
        .size_limit(REGEX_COMPILE_SIZE_LIMIT)
        .build()
        .map_err(|e| format!("bad {kind} regex '{pattern}': {e}"))
}
/// Test helper: removes a leading bare flag group such as `(?i)` from `src`.
///
/// The input is returned untouched when it does not start with `(?`, when a
/// `:` appears before the first `)` (a scoped group like `(?i:...)`), or when
/// no closing `)` exists at all.
#[cfg(test)]
fn strip_outer_flag_group(src: &str) -> &str {
    let Some(rest) = src.strip_prefix("(?") else {
        return src;
    };
    // Whichever of `:` / `)` comes first decides the outcome.
    match rest.find([':', ')']) {
        Some(pos) if rest.as_bytes()[pos] == b')' => &rest[pos + 1..],
        _ => src,
    }
}
/// Returns at most `max` bytes of `s` beginning at byte offset `start`,
/// never splitting a UTF-8 character.
///
/// `start` is moved back to the nearest character boundary if it falls inside
/// a character, and the end is pulled back to a boundary as well. Yields `""`
/// when `start` is at or past the end of `s`.
fn clamped_snippet(s: &str, start: usize, max: usize) -> &str {
    if start >= s.len() {
        return "";
    }
    // Offset 0 is always a boundary, so the backwards search cannot miss.
    let begin = (0..=start)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    let limit = begin.saturating_add(max).min(s.len());
    // `begin` is itself a boundary, so this search cannot miss either.
    let end = (begin..=limit)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(begin);
    &s[begin..end]
}
/// Process-wide rule engine, built on first access.
///
/// If `RuleEngine::load_embedded` fails, the failure is logged and an empty
/// default engine is stored instead.
static RULE_DB: Lazy<RwLock<RuleEngine>> = Lazy::new(|| {
    let engine = match RuleEngine::load_embedded() {
        Ok(loaded) => loaded,
        Err(e) => {
            tracing::warn!("Failed to load embedded WAF rules: {e}");
            RuleEngine::default()
        }
    };
    RwLock::new(engine)
});
/// Compiled WAF detection rules plus the combined body-pattern matcher used by `detect`.
#[derive(Debug, Default, Clone)]
pub struct RuleEngine {
// Compiled rules keyed by rule name (the loaders insert under `compiled.name`).
pub rules: HashMap<String, CompiledWafRule>,
// Rule names in the order they were first inserted; iteration in this file goes through this list.
pub names: Vec<String>,
// Combined matcher over the body patterns; populated by `compile_body_regex_set`.
body_regex_set: Option<RegexSet>,
// Entry `i` names the WAF and weight for pattern `i` of `body_regex_set`.
body_pattern_map: Vec<BodyPatternRef>,
// Entry `i` is the standalone regex for pattern `i`; `detect` uses it to locate the match for a snippet.
body_regexes: Vec<Regex>,
}
/// Ties one pattern of the body `RegexSet` back to the rule it came from.
#[derive(Debug, Clone)]
struct BodyPatternRef {
// Name of the WAF rule that owns the pattern.
waf_name: String,
// Weight of the owning signature; added to the WAF's score on a body hit.
weight: f64,
}
/// One WAF rule after its signature regexes have been compiled.
#[derive(Debug, Clone)]
pub struct CompiledWafRule {
// Rule name; also the key under which the rule is stored in `RuleEngine::rules`.
pub name: String,
// Vendor string copied from the rule file.
pub vendor: String,
// Minimum accumulated score for `RuleEngine::detect` to report this WAF.
pub confidence_threshold: f64,
// Evasion names copied from the rule file; returned by `evasions_for` and `suggest_evasion`.
pub evasions: Vec<String>,
// Copied verbatim from the rule file's `source` field; not interpreted in this file.
pub source: String,
// Signatures evaluated independently by `RuleEngine::detect`.
pub signatures: Vec<CompiledSignature>,
}
/// A single compiled signature: any combination of header, cookie, body and status checks.
#[derive(Debug, Clone)]
pub struct CompiledSignature {
// Header name to restrict `header_regex` to, lower-cased at compile time.
// When absent, `detect` tries `header_regex` against every header value.
pub header_name: Option<String>,
// Pattern matched against header values.
pub header_regex: Option<Regex>,
// Pattern matched against the values of `set-cookie` headers.
pub cookie_regex: Option<Regex>,
// Pattern matched against the response body (through the engine's `RegexSet`).
pub body_regex: Option<Regex>,
// Exact HTTP status the response must have for this check to match.
pub status_code: Option<u16>,
// Score contribution when the signature matches.
pub weight: f64,
}
/// Top-level shape of a rule TOML document: a list of `[[waf]]` tables.
#[derive(Debug, Clone, Deserialize)]
struct RawRuleDb {
// Empty when the document contains no `[[waf]]` tables.
#[serde(default)]
waf: Vec<RawWafRule>,
}
/// One `[[waf]]` table as deserialized, before regex compilation.
#[derive(Debug, Clone, Deserialize)]
struct RawWafRule {
// Required fields: deserialization fails when either is missing.
name: String,
vendor: String,
// Falls back to `default_threshold()` when omitted.
#[serde(default = "default_threshold")]
confidence_threshold: f64,
// Optional; empty list when omitted.
#[serde(default)]
evasions: Vec<String>,
// Optional; empty string when omitted.
#[serde(default)]
source: String,
// The `[[waf.signature]]` tables; empty list when omitted.
#[serde(default)]
signature: Vec<RawSignature>,
}
/// One `[[waf.signature]]` table as deserialized; patterns are still plain strings here.
#[derive(Debug, Clone, Deserialize)]
struct RawSignature {
// Every matcher field is optional; `compile_waf` compiles whichever are present.
header_name: Option<String>,
header_regex: Option<String>,
cookie_regex: Option<String>,
body_regex: Option<String>,
status_code: Option<u16>,
// Falls back to `default_weight()` when omitted.
#[serde(default = "default_weight")]
weight: f64,
}
/// Serde fallback for `RawWafRule::confidence_threshold` when a rule omits it.
fn default_threshold() -> f64 {
    const FALLBACK_THRESHOLD: f64 = 0.3;
    FALLBACK_THRESHOLD
}
/// Serde fallback for `RawSignature::weight` when a signature omits it.
fn default_weight() -> f64 {
    const FALLBACK_WEIGHT: f64 = 0.4;
    FALLBACK_WEIGHT
}
/// Rule catalog text pulled in at compile time from `embedded_detect_rules.toml`
/// under `OUT_DIR` (presumably generated by the crate's build script — confirm).
const EMBEDDED_RULES_TOML: &str =
include_str!(concat!(env!("OUT_DIR"), "/embedded_detect_rules.toml"));
impl RuleEngine {
    /// Builds a ready-to-use engine from the rule catalog embedded at compile
    /// time, falling back to a `rules/detect` directory on disk.
    ///
    /// The fallback is taken when the embedded text fails to load or yields
    /// no rules. The first existing directory among `rules/detect`,
    /// `../rules/detect` and `../../rules/detect` (relative to the current
    /// working directory) is used.
    ///
    /// # Errors
    ///
    /// Returns `DetectRulesError::Io` with `NotFound` when the fallback is
    /// needed and none of the candidate directories exists, and propagates
    /// errors from `load_directory` and `compile_body_regex_set`.
    pub fn load_embedded() -> Result<Self, DetectRulesError> {
        let mut engine = RuleEngine::default();
        let embedded = engine.load_from_str(EMBEDDED_RULES_TOML);
        if let Err(e) = &embedded {
            tracing::warn!(
                "embedded WAF rules unusable, falling back to rules/detect on disk: {e}"
            );
        }
        if embedded.is_err() || engine.rules.is_empty() {
            // A failed load can leave the rules parsed before the error in
            // place; start from a clean engine so the on-disk catalog is not
            // merged into a half-loaded one.
            engine = RuleEngine::default();
            let candidates = [
                std::path::PathBuf::from("rules/detect"),
                std::path::PathBuf::from("../rules/detect"),
                std::path::PathBuf::from("../../rules/detect"),
            ];
            let Some(dir) = candidates.iter().find(|dir| dir.is_dir()) else {
                return Err(DetectRulesError::Io(std::io::Error::new(
                    std::io::ErrorKind::NotFound,
                    "rules/detect directory not found and no embedded rules available",
                )));
            };
            engine.load_directory(dir)?;
        }
        engine.compile_body_regex_set()?;
        Ok(engine)
    }

    /// Parses `toml_content` and adds every `[[waf]]` rule it contains.
    ///
    /// A rule whose name is already present replaces the existing rule while
    /// keeping its original position in `names`. Call
    /// `compile_body_regex_set` afterwards so body patterns take effect.
    ///
    /// # Errors
    ///
    /// Returns `DetectRulesError::Parse` when the TOML is invalid or a
    /// pattern fails to compile. Rules processed before the failing one stay
    /// loaded.
    pub fn load_from_str(&mut self, toml_content: &str) -> Result<(), DetectRulesError> {
        let raw: RawRuleDb = toml::from_str(toml_content)
            .map_err(|e| DetectRulesError::Parse(format!("embedded rules: {e}")))?;
        for waf in raw.waf {
            let compiled = Self::compile_waf(waf)
                .map_err(|e| DetectRulesError::Parse(format!("embedded rules: {e}")))?;
            self.insert_rule(compiled);
        }
        Ok(())
    }

    /// Loads every TOML rule file that
    /// `wafrift_types::loaders::read_toml_files_strict` returns for `path`.
    ///
    /// Merging follows the same replace-by-name behaviour as `load_from_str`.
    ///
    /// # Errors
    ///
    /// Propagates errors from the file reader and returns
    /// `DetectRulesError::Parse`, prefixed with the offending file, when a
    /// file is not valid TOML or a pattern fails to compile.
    pub fn load_directory(&mut self, path: &std::path::Path) -> Result<(), DetectRulesError> {
        for (entry, content) in wafrift_types::loaders::read_toml_files_strict(path)? {
            let raw: RawRuleDb = toml::from_str(&content)
                .map_err(|e| DetectRulesError::Parse(format!("{}: {e}", entry.display())))?;
            for waf in raw.waf {
                let compiled = Self::compile_waf(waf)
                    .map_err(|e| DetectRulesError::Parse(format!("{}: {e}", entry.display())))?;
                self.insert_rule(compiled);
            }
        }
        Ok(())
    }

    /// Stores `compiled` under its name, recording the name in `names` only
    /// the first time it is seen.
    fn insert_rule(&mut self, compiled: CompiledWafRule) {
        let key = compiled.name.clone();
        if self.rules.insert(key.clone(), compiled).is_none() {
            self.names.push(key);
        }
    }

    /// Compiles one optional pattern of a signature.
    ///
    /// Returns `Ok(None)` when the pattern is absent or longer than
    /// `MAX_REGEX_PATTERN_LEN` (the latter is logged and skipped rather than
    /// treated as an error).
    fn compile_optional(
        pattern: Option<&str>,
        kind: &str,
        waf_name: &str,
    ) -> Result<Option<Regex>, String> {
        let Some(pattern) = pattern else {
            return Ok(None);
        };
        if pattern.len() > MAX_REGEX_PATTERN_LEN {
            tracing::warn!(
                waf = %waf_name,
                pattern_len = pattern.len(),
                max = MAX_REGEX_PATTERN_LEN,
                "skipping oversized {} regex",
                kind
            );
            return Ok(None);
        }
        compile_ci_regex(pattern, kind).map(Some)
    }

    /// Turns a deserialized rule into its compiled form.
    ///
    /// Header names are lower-cased; patterns go through `compile_ci_regex`.
    ///
    /// # Errors
    ///
    /// Returns the message from `compile_ci_regex` for the first pattern that
    /// fails to compile.
    fn compile_waf(raw: RawWafRule) -> Result<CompiledWafRule, String> {
        let mut signatures = Vec::with_capacity(raw.signature.len());
        for sig in raw.signature {
            let header_regex =
                Self::compile_optional(sig.header_regex.as_deref(), "header", &raw.name)?;
            let cookie_regex =
                Self::compile_optional(sig.cookie_regex.as_deref(), "cookie", &raw.name)?;
            let body_regex = Self::compile_optional(sig.body_regex.as_deref(), "body", &raw.name)?;
            signatures.push(CompiledSignature {
                header_name: sig.header_name.map(|s| s.to_ascii_lowercase()),
                header_regex,
                cookie_regex,
                body_regex,
                status_code: sig.status_code,
                weight: sig.weight,
            });
        }
        Ok(CompiledWafRule {
            name: raw.name,
            vendor: raw.vendor,
            confidence_threshold: raw.confidence_threshold,
            evasions: raw.evasions,
            source: raw.source,
            signatures,
        })
    }

    /// Rebuilds the combined body matcher from the currently loaded rules.
    ///
    /// Patterns are collected in `names` order, at most
    /// `MAX_BODY_REGEX_PATTERNS` of them; a warning is logged when a pattern
    /// has to be left out. When no rule carries a body pattern the matcher is
    /// cleared.
    ///
    /// # Errors
    ///
    /// Returns `DetectRulesError::Parse` when the `RegexSet` cannot be built;
    /// the previous matcher state is left untouched in that case.
    pub fn compile_body_regex_set(&mut self) -> Result<(), DetectRulesError> {
        let mut patterns: Vec<String> = Vec::new();
        let mut map: Vec<BodyPatternRef> = Vec::new();
        let mut regexes: Vec<Regex> = Vec::new();
        'collect: for name in &self.names {
            // `names` and `rules` are public and may have drifted apart;
            // skip names without a rule instead of panicking.
            let Some(rule) = self.rules.get(name) else {
                continue;
            };
            for sig in &rule.signatures {
                let Some(re) = sig.body_regex.as_ref() else {
                    continue;
                };
                // Checked only when another pattern is about to be added, so
                // the warning fires exactly when something is dropped.
                if patterns.len() >= MAX_BODY_REGEX_PATTERNS {
                    tracing::warn!(
                        limit = MAX_BODY_REGEX_PATTERNS,
                        waf_truncation_started_at = %name,
                        "body regex set hit cap; signatures for this WAF \
                         and every WAF after it in iteration order will \
                         NOT match on body text. Consider raising \
                         MAX_BODY_REGEX_PATTERNS or pruning low-weight \
                         rules."
                    );
                    break 'collect;
                }
                patterns.push(re.as_str().to_string());
                map.push(BodyPatternRef {
                    waf_name: name.clone(),
                    weight: sig.weight,
                });
                regexes.push(re.clone());
            }
        }
        // Always assign the set, `None` included: keeping a stale set next to
        // a freshly emptied map would hand `detect` indices with nothing
        // behind them.
        let set = if patterns.is_empty() {
            None
        } else {
            let built = regex::RegexSetBuilder::new(&patterns)
                .size_limit(REGEX_COMPILE_SIZE_LIMIT)
                .build()
                .map_err(|e| {
                    DetectRulesError::Parse(format!("failed to compile body RegexSet: {e}"))
                })?;
            Some(built)
        };
        self.body_regex_set = set;
        self.body_pattern_map = map;
        self.body_regexes = regexes;
        Ok(())
    }

    /// Scores a response against every loaded rule and returns the WAFs whose
    /// score reaches their threshold.
    ///
    /// * `status` - HTTP status code of the response.
    /// * `headers` - response headers as `(name, value)` pairs.
    /// * `body` - response body text.
    ///
    /// Each matching body pattern adds its weight. Each signature with a
    /// status, header or cookie check adds its weight once if any of those
    /// checks matches. A WAF supported only by body indicators must reach at
    /// least `BODY_ONLY_MIN_CONFIDENCE`. Reported confidence is capped at
    /// `1.0`; results are sorted by confidence descending, then by name.
    pub fn detect(
        &self,
        status: u16,
        headers: &[(String, String)],
        body: &str,
    ) -> Vec<DetectedWaf> {
        let mut waf_scores: HashMap<&str, (f64, Vec<String>)> = HashMap::new();

        // Pass 1: body patterns, all evaluated at once through the set.
        if let Some(set) = self.body_regex_set.as_ref() {
            for pattern_idx in set.matches(body) {
                let (Some(pref), Some(re)) = (
                    self.body_pattern_map.get(pattern_idx),
                    self.body_regexes.get(pattern_idx),
                ) else {
                    continue;
                };
                let entry = waf_scores
                    .entry(pref.waf_name.as_str())
                    .or_insert_with(|| (0.0, Vec::new()));
                entry.0 += pref.weight;
                // The set reports only which pattern matched; the standalone
                // regex supplies the position for the snippet.
                if let Some(m) = re.find(body) {
                    let snippet = clamped_snippet(body, m.start(), 40);
                    entry.1.push(format!("body: {snippet}"));
                }
            }
        }

        // Pass 2: status, header and cookie checks, one signature at a time.
        for name in &self.names {
            let Some(rule) = self.rules.get(name) else {
                continue;
            };
            for sig in &rule.signatures {
                let mut indicators: Vec<String> = Vec::new();
                if sig.status_code == Some(status) {
                    indicators.push(format!("status: {status}"));
                }
                if let Some(re) = sig.header_regex.as_ref() {
                    let hname = sig.header_name.as_deref().unwrap_or("");
                    // Only the first matching header is recorded.
                    let hit = headers.iter().find_map(|(k, v)| {
                        if hname.is_empty() || k.eq_ignore_ascii_case(hname) {
                            re.find(v).map(|m| {
                                format!("header {k}: {}", clamped_snippet(v, m.start(), 40))
                            })
                        } else {
                            None
                        }
                    });
                    indicators.extend(hit);
                }
                if let Some(re) = sig.cookie_regex.as_ref() {
                    let cookie_hit = headers
                        .iter()
                        .find(|(k, v)| k.eq_ignore_ascii_case("set-cookie") && re.is_match(v));
                    if let Some((k, _)) = cookie_hit {
                        indicators.push(format!("cookie: {k}"));
                    }
                }
                // A score entry is created only for signatures that matched.
                if indicators.is_empty() {
                    continue;
                }
                let entry = waf_scores
                    .entry(name.as_str())
                    .or_insert_with(|| (0.0, Vec::new()));
                entry.0 += sig.weight;
                entry.1.append(&mut indicators);
            }
        }

        let mut results: Vec<DetectedWaf> = waf_scores
            .into_iter()
            .filter_map(|(name, (score, indicators))| {
                let rule = self.rules.get(name)?;
                let has_non_body_indicator = indicators
                    .iter()
                    .any(|indicator| !indicator.starts_with("body: "));
                let effective_threshold = if has_non_body_indicator {
                    rule.confidence_threshold
                } else {
                    rule.confidence_threshold.max(BODY_ONLY_MIN_CONFIDENCE)
                };
                (score >= effective_threshold).then(|| DetectedWaf {
                    name: name.to_string(),
                    confidence: score.min(1.0),
                    indicators,
                })
            })
            .collect();
        results.sort_by(|a, b| {
            b.confidence
                .partial_cmp(&a.confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| a.name.cmp(&b.name))
        });
        results
    }

    /// Returns the evasion names recorded for the rule called `name`, or an
    /// empty list when no such rule is loaded.
    #[must_use]
    pub fn evasions_for(&self, name: &str) -> Vec<&str> {
        self.rules
            .get(name)
            .map(|r| r.evasions.iter().map(String::as_str).collect())
            .unwrap_or_default()
    }

    /// Number of loaded rules.
    #[must_use]
    pub fn len(&self) -> usize {
        self.rules.len()
    }

    /// `true` when no rules are loaded.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.rules.is_empty()
    }
}
/// One WAF reported by `RuleEngine::detect`.
#[derive(Debug, Clone)]
pub struct DetectedWaf {
// Name of the matching rule.
pub name: String,
// Accumulated signature weight, capped at 1.0 by `detect`.
pub confidence: f64,
// Evidence strings built by `detect`, prefixed `body: `, `status: `, `header ` or `cookie: `.
pub indicators: Vec<String>,
}
/// Errors produced while loading or compiling detection rules.
#[derive(Debug, thiserror::Error)]
pub enum DetectRulesError {
// Filesystem failure; also used when no rule directory can be found.
#[error("io error: {0}")]
Io(#[from] std::io::Error),
// Invalid TOML, a pattern that failed to compile, or a `RegexSet` build failure.
#[error("parse error: {0}")]
Parse(String),
}
/// Runs `f` against the global engine while holding its read lock and
/// returns whatever `f` returns.
///
/// A poisoned lock is not treated as an error: the guard is recovered and
/// the engine is used as it stands.
pub fn with_engine<F, R>(f: F) -> R
where
    F: FnOnce(&RuleEngine) -> R,
{
    let engine = match RULE_DB.read() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    };
    f(&engine)
}
/// Rebuilds the rule engine via `RuleEngine::load_embedded` and swaps it into
/// the global slot.
///
/// The new engine is built before the write lock is taken, so a failed load
/// leaves the current engine in place.
///
/// A poisoned lock is recovered rather than reported, matching
/// `with_engine`: the stored value is overwritten wholesale, so whatever
/// state a panicking writer left behind is discarded anyway.
///
/// # Errors
///
/// Propagates any error from `RuleEngine::load_embedded`.
pub fn reload() -> Result<(), DetectRulesError> {
    let new_engine = RuleEngine::load_embedded()?;
    let mut guard = RULE_DB
        .write()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    *guard = new_engine;
    Ok(())
}
#[must_use]
pub fn detect(status: u16, headers: &[(String, String)], body: &str) -> Vec<DetectedWaf> {
with_engine(|engine| engine.detect(status, headers, body))
}
/// Returns a copy of the global engine's rule names, in load order.
#[must_use]
pub fn supported_wafs() -> Vec<String> {
    with_engine(|engine| engine.names.iter().cloned().collect())
}
/// Returns the evasions listed for `waf_name` in the global engine, or a
/// fixed generic list when no rule with that name is loaded.
#[must_use]
pub fn suggest_evasion(waf_name: &str) -> Vec<String> {
    with_engine(|engine| match engine.rules.get(waf_name) {
        Some(rule) => rule.evasions.clone(),
        // Unknown WAF: fall back to the generic set.
        None => [
            "CaseAlternation",
            "SqlCommentInsertion",
            "DoubleUrlEncode",
            "ContentTypeSwitch",
        ]
        .iter()
        .map(|technique| String::from(*technique))
        .collect(),
    })
}
/// Post-filtering options used by `detect_with_config`.
#[derive(Debug, Clone, Copy)]
pub struct DetectConfig {
// Results with confidence below this value are dropped.
pub threshold: f64,
// Confidence gap below which neighbouring results are treated as ambiguous and kept together.
pub ambiguity_delta: f64,
}
impl Default for DetectConfig {
fn default() -> Self {
Self {
threshold: 0.3,
ambiguity_delta: 0.15,
}
}
}
/// Runs the global `detect` and narrows the result list according to `config`.
///
/// * `status`, `headers`, `body` - forwarded unchanged to `detect`.
/// * `config` - confidence floor and ambiguity gap.
///
/// Results below `config.threshold` are dropped. From what remains (sorted
/// by descending confidence) the top result is kept, plus each following
/// result for as long as its gap to the result directly before it is smaller
/// than `config.ambiguity_delta`. The first gap that reaches the delta ends
/// the list, so a clear winner is returned alone.
#[must_use]
pub fn detect_with_config(
    status: u16,
    headers: &[(String, String)],
    body: &str,
    config: DetectConfig,
) -> Vec<DetectedWaf> {
    let mut results = detect(status, headers, body);
    results.retain(|r| r.confidence >= config.threshold);
    // Count the leading run of "too close to call" neighbour pairs. Each such
    // pair admits one more result after the top one, so the number kept is
    // the run length plus one — the first pair must not be counted twice.
    let ambiguous_links = results
        .windows(2)
        .take_while(|pair| pair[0].confidence - pair[1].confidence < config.ambiguity_delta)
        .count();
    results.truncate(ambiguous_links + 1);
    results
}
// Unit tests for the rule engine: loading, scoring, the case-insensitive regex
// wrapper, catalog-wide checks against the embedded rules, and size-limit rejection.
#[cfg(test)]
mod tests {
use super::*;
// Two-rule fixture: "TestWAF" (header, body and status signatures) and
// "AnotherWAF" (a single body signature).
const TEST_TOML: &str = r#"
[[waf]]
name = "TestWAF"
vendor = "test"
confidence_threshold = 0.3
evasions = ["CaseAlternation", "SqlCommentInsertion"]
[[waf.signature]]
header_name = "x-test-waf"
header_regex = "active"
weight = 0.9
[[waf.signature]]
body_regex = "blocked by test"
weight = 0.95
[[waf.signature]]
status_code = 403
weight = 0.5
[[waf]]
name = "AnotherWAF"
vendor = "another"
confidence_threshold = 0.5
evasions = ["DoubleUrlEncode"]
[[waf.signature]]
body_regex = "another waf"
weight = 0.6
"#;
// Builds an engine from TEST_TOML with the body regex set compiled.
fn test_engine() -> RuleEngine {
let mut engine = RuleEngine::default();
engine.load_from_str(TEST_TOML).expect("load test toml");
engine.compile_body_regex_set().expect("compile regex set");
engine
}
// Both fixture rules are loaded.
#[test]
fn load_from_str_populates_rules() {
let engine = test_engine();
assert_eq!(engine.len(), 2);
assert!(!engine.is_empty());
}
// A matching header alone is enough to detect TestWAF.
#[test]
fn detect_by_header() {
let engine = test_engine();
let headers = vec![("x-test-waf".into(), "active".into())];
let results = engine.detect(200, &headers, "OK");
assert_eq!(results.len(), 1);
assert_eq!(results[0].name, "TestWAF");
assert!(results[0].confidence >= 0.9);
}
// A matching body alone is enough to detect TestWAF (weight 0.95).
#[test]
fn detect_by_body() {
let engine = test_engine();
let headers: Vec<(String, String)> = vec![];
let results = engine.detect(200, &headers, "you are blocked by test engine");
assert_eq!(results.len(), 1);
assert_eq!(results[0].name, "TestWAF");
assert!(results[0].confidence >= 0.95);
}
// A matching status code alone is enough to detect TestWAF.
#[test]
fn detect_by_status() {
let engine = test_engine();
let headers: Vec<(String, String)> = vec![];
let results = engine.detect(403, &headers, "");
assert_eq!(results.len(), 1);
assert_eq!(results[0].name, "TestWAF");
}
// Unrelated header and body produce no detections.
#[test]
fn detect_no_match() {
let engine = test_engine();
let headers = vec![("server".into(), "nginx".into())];
let results = engine.detect(200, &headers, "Welcome");
assert!(results.is_empty());
}
// A body-only hit with weight 0.6 clears AnotherWAF's 0.5 threshold.
#[test]
fn detect_confidence_threshold_filters_body_only() {
let engine = test_engine();
let results = engine.detect(200, &[], "another waf detected");
assert_eq!(results.len(), 1);
assert_eq!(results[0].name, "AnotherWAF");
}
// evasions_for returns the evasions listed in the rule.
#[test]
fn evasions_for_known_waf() {
let engine = test_engine();
let evasions = engine.evasions_for("TestWAF");
assert_eq!(evasions.len(), 2);
assert!(evasions.contains(&"CaseAlternation"));
}
// evasions_for returns an empty list for a name that is not loaded.
#[test]
fn evasions_for_unknown_waf_empty() {
let engine = test_engine();
assert!(engine.evasions_for("Unknown").is_empty());
}
// A body-only score of 0.4 is rejected even though the rule's own threshold
// is 0.1: body-only matches are held to BODY_ONLY_MIN_CONFIDENCE.
#[test]
fn detect_body_only_needs_higher_threshold() {
let mut engine = RuleEngine::default();
engine
.load_from_str(
r#"
[[waf]]
name = "LowConfWAF"
vendor = "test"
confidence_threshold = 0.1
[[waf.signature]]
body_regex = "blocked"
weight = 0.4
"#,
)
.expect("load");
engine.compile_body_regex_set().expect("compile");
let results = engine.detect(200, &[], "blocked");
assert!(results.is_empty());
}
// A default engine has no rules and detects nothing.
#[test]
fn empty_engine_returns_empty() {
let engine = RuleEngine::default();
assert!(engine.is_empty());
assert_eq!(engine.len(), 0);
let results = engine.detect(200, &[], "body");
assert!(results.is_empty());
}
// With several WAFs matching, TestWAF is reported first.
#[test]
fn detect_sorts_by_confidence_desc() {
let engine = test_engine();
let headers = vec![("x-test-waf".into(), "active".into())];
let results = engine.detect(200, &headers, "blocked by test and another waf");
assert!(!results.is_empty());
assert_eq!(results[0].name, "TestWAF");
}
// A capitalized literal pattern matches input in any letter case.
#[test]
fn ci_wrapper_matches_capitalized_literal_against_lowercase_input() {
let re = compile_ci_regex("Cloudflare", "header").expect("compile");
assert!(re.is_match("cloudflare"));
assert!(re.is_match("CLOUDFLARE"));
assert!(re.is_match("CloudFlare"));
assert!(re.is_match("cLoUdFlArE"));
}
// Character classes become case-insensitive too, while the structure of the
// pattern (required letters and digits) still applies.
#[test]
fn ci_wrapper_makes_uppercase_char_class_match_lowercase_input() {
let re = compile_ci_regex("cache-[a-z]{3}[0-9]+-[A-Z]{3}", "header").expect("compile");
assert!(re.is_match("cache-lga21972-lga"));
assert!(re.is_match("cache-lga21972-LGA"));
assert!(re.is_match("cache-lga21972-LGA, cache-bur-kbur8200085-BUR"));
assert!(!re.is_match("cache-2-LGA"));
assert!(!re.is_match("cache-lga--LGA"));
}
// A pattern that already starts with (?i) still compiles and stays case-insensitive.
#[test]
fn ci_wrapper_preserves_existing_outer_ci_flag_idempotently() {
let re = compile_ci_regex("(?i)Already", "header").expect("compile");
assert!(re.is_match("ALREADY"));
assert!(re.is_match("already"));
}
// A leading (?-i) keeps the pattern case-sensitive.
#[test]
fn ci_wrapper_respects_explicit_case_sensitive_opt_out() {
let re = compile_ci_regex("(?-i)Strict", "header").expect("compile");
assert!(re.is_match("Strict"));
assert!(!re.is_match("strict"));
assert!(!re.is_match("STRICT"));
}
// Flag groups that combine `i` with other flags are honoured as written.
#[test]
fn ci_wrapper_handles_combined_flag_groups() {
let re = compile_ci_regex("(?im)^TOKEN", "body").expect("compile");
assert!(re.is_match("first\ntoken"));
let re_opt_out = compile_ci_regex("(?-im)^Strict", "body").expect("compile");
assert!(re_opt_out.is_match("Strict line"));
assert!(!re_opt_out.is_match("strict line"));
}
// An explicit (?i) prefix and the automatic one give the same match results.
#[test]
fn ci_wrapper_does_not_double_wrap_when_outer_flag_present() {
let already = compile_ci_regex("(?i)foo", "header").expect("compile");
let plain = compile_ci_regex("foo", "header").expect("compile");
for s in ["foo", "FOO", "Foo", "FoO"] {
assert_eq!(already.is_match(s), plain.is_match(s));
}
}
// Anchors keep working after the case-insensitive prefix is added.
#[test]
fn ci_wrapper_compiles_anchored_patterns_without_breaking_anchors() {
let re = compile_ci_regex("^Cloudflare$", "header").expect("compile");
assert!(re.is_match("CLOUDFLARE"));
assert!(!re.is_match("foo Cloudflare bar"));
assert!(!re.is_match("Cloudflare extra"));
}
// A non-capturing group with an escaped hyphen is wrapped and still requires the hyphen.
#[test]
fn ci_wrapper_compiles_patterns_with_escaped_metacharacters() {
let re = compile_ci_regex("(?:F5\\-TrafficShield)", "header").expect("compile");
assert!(re.is_match("f5-trafficshield"));
assert!(re.is_match("F5-TrafficShield"));
assert!(!re.is_match("F5TrafficShield"));
}
// (?si) is case-insensitive, (?-si) is case-sensitive, and a plain (?:...)
// group does not count as a flag group.
#[test]
fn ci_wrapper_respects_multi_letter_flag_groups() {
let re = compile_ci_regex("(?si)Cloudflare", "header").expect("compile");
assert!(re.is_match("CLOUDFLARE"));
let re = compile_ci_regex("(?-si)Cloudflare", "header").expect("compile");
assert!(re.is_match("Cloudflare"));
assert!(
!re.is_match("cloudflare"),
"(?-si) author intent: case-sensitive — must not match lowercase"
);
let re = compile_ci_regex("(?:Imperva)", "header").expect("compile");
assert!(re.is_match("IMPERVA"));
assert!(re.is_match("imperva"));
}
// Patterns using \w compile and match in either case.
#[test]
fn ci_wrapper_compiles_patterns_with_unicode_metaclasses() {
let re = compile_ci_regex("token-\\w+", "header").expect("compile");
assert!(re.is_match("TOKEN-abc123"));
assert!(re.is_match("token-Xyz_99"));
}
// Alternation inside a non-capturing group is matched case-insensitively.
#[test]
fn ci_wrapper_compiles_empty_alternation_and_zero_width_safely() {
let re = compile_ci_regex("(?:foo|bar)", "header").expect("compile");
assert!(re.is_match("FOO"));
assert!(re.is_match("Bar"));
assert!(!re.is_match("baz"));
}
// An invalid pattern yields an error that names the kind and echoes the pattern.
#[test]
fn ci_wrapper_rejects_pattern_that_was_already_broken() {
let err = compile_ci_regex("([unclosed", "header");
assert!(err.is_err(), "broken pattern must surface as Err");
let msg = err.unwrap_err();
assert!(
msg.contains("header"),
"error message must name the regex kind: {msg}"
);
assert!(
msg.contains("[unclosed"),
"error message must echo the offending pattern: {msg}"
);
}
// The embedded catalog loads and contains at least 50 rules.
#[test]
fn every_embedded_rule_compiles() {
let engine = RuleEngine::load_embedded().expect("all embedded rules compile");
assert!(engine.len() >= 50, "catalog shrank: {}", engine.len());
}
// Every compiled header regex in the catalog carries a case-insensitive flag,
// except those that explicitly opt out with (?-i...).
#[test]
fn every_header_regex_in_catalog_is_case_insensitive() {
let engine = RuleEngine::load_embedded().expect("load");
let mut checked = 0;
for rule in engine.rules.values() {
for sig in &rule.signatures {
if let Some(ref re) = sig.header_regex {
let src = re.as_str();
if src.starts_with("(?-i)") || src.starts_with("(?-i-") {
continue;
}
assert!(
src.starts_with("(?i)") || src.starts_with("(?i-")
|| src.starts_with("(?im")
|| src.starts_with("(?is")
|| src.starts_with("(?ix")
|| src.starts_with("(?iu")
|| src.contains("(?i)")
|| src.contains("(?i:"),
"header regex `{src}` in rule `{}` is NOT case-insensitive — that's the lower-cased-value bug class waiting to happen",
rule.name
);
checked += 1;
}
}
}
assert!(
checked >= 30,
"expected many CI-wrapped header rules, got {checked}"
);
}
// Heuristic round-trip: the letters, digits, spaces and hyphens of each header
// pattern, lower-cased, are fed back to the regex; at least 20 must match.
#[test]
fn lowercase_input_must_match_uppercase_pattern_for_every_rule() {
let engine = RuleEngine::load_embedded().expect("load");
let mut tested = 0;
let mut not_applicable = 0;
for rule in engine.rules.values() {
for sig in &rule.signatures {
let Some(ref re) = sig.header_regex else {
continue;
};
let src = re.as_str();
if src.starts_with("(?-i)") {
continue;
}
let literal: String = src
.chars()
.filter(|c| c.is_ascii_alphanumeric() || *c == ' ' || *c == '-')
.collect();
let lowered = literal.to_ascii_lowercase();
if lowered.trim().is_empty() {
not_applicable += 1;
continue;
}
if re.is_match(&lowered) {
tested += 1;
}
}
}
assert!(
tested >= 20,
"lowercase round-trip succeeded for only {tested} rules ({not_applicable} skipped) — CI wrapper likely broken"
);
}
// A comma-joined multi-hop X-Served-By value still yields a detection
// through `classifier::detect` (defined outside this file).
#[test]
fn csv_joined_multi_hop_header_value_still_matches_anchored_pattern() {
use crate::waf_detect::classifier;
let headers = vec![(
"X-Served-By".into(),
"cache-aaa12345-AAA, cache-bbb67890-BBB, cache-ccc-with-hyphens-CCC".into(),
)];
let detected = classifier::detect(200, &headers, b"");
assert!(
!detected.is_empty(),
"CSV multi-hop cache header must produce at least one detection"
);
}
// For every catalog rule whose header pattern is a plain literal, sending that
// literal as the header value must detect the rule via `classifier::detect`.
#[test]
fn every_literal_header_rule_in_catalog_matches_capitalized_value() {
use crate::waf_detect::classifier;
let engine = RuleEngine::load_embedded().expect("load");
let mut tested = 0;
let mut missed: Vec<(String, String, String)> = Vec::new();
for rule in engine.rules.values() {
for sig in &rule.signatures {
let (Some(name), Some(re)) = (sig.header_name.as_ref(), sig.header_regex.as_ref())
else {
continue;
};
let src = re.as_str();
let literal = strip_outer_flag_group(src);
if literal.is_empty()
|| !literal.chars().all(|c| {
c.is_ascii_alphanumeric() || matches!(c, ' ' | '-' | '_' | '.' | '/')
})
{
continue;
}
let value = literal.to_string();
let detected = classifier::detect(200, &[(name.clone(), value.clone())], b"");
if detected.iter().any(|r| r.name == rule.name) {
tested += 1;
} else {
missed.push((rule.name.clone(), name.clone(), value));
}
}
}
assert!(
tested >= 20,
"expected >=20 literal-pattern catalog rules to fire under CI; got {tested}. Misses: {missed:?}"
);
assert!(
missed.is_empty(),
"rules whose own literal value did NOT fire through the public API (CI wrapper broken): {missed:?}"
);
}
// Title-casing the header name must not prevent detection; the test stops
// after five catalog rules have been confirmed.
#[test]
fn mixed_case_header_name_with_known_lowercase_signature_still_matches() {
use crate::waf_detect::classifier;
let engine = RuleEngine::load_embedded().expect("load");
let mut sampled = 0;
for rule in engine.rules.values() {
for sig in &rule.signatures {
let (Some(name), Some(re)) = (sig.header_name.as_ref(), sig.header_regex.as_ref())
else {
continue;
};
let literal = strip_outer_flag_group(re.as_str());
if literal.is_empty()
|| !literal
.chars()
.all(|c| c.is_ascii_alphanumeric() || c == ' ' || c == '-' || c == '_')
{
continue;
}
let title_name: String = name
.split('-')
.map(|part| {
let mut chars = part.chars();
match chars.next() {
None => String::new(),
Some(c) => c.to_ascii_uppercase().to_string() + chars.as_str(),
}
})
.collect::<Vec<_>>()
.join("-");
let value: String = literal.to_string();
let detected = classifier::detect(200, &[(title_name, value)], b"");
if detected.iter().any(|r| r.name == rule.name) {
sampled += 1;
}
if sampled >= 5 {
return;
}
}
}
assert!(
sampled >= 5,
"mixed-case header-name match should work for at least 5 catalog rules; got {sampled}"
);
}
// Headers from two different products must yield at least two detections.
#[test]
fn multi_waf_chain_returns_every_layer_not_just_the_top() {
use crate::waf_detect::classifier;
let headers = vec![
("Server".into(), "envoy".into()),
("X-Envoy-Upstream-Service-Time".into(), "120".into()),
("X-Served-By".into(), "cache-aaa11111-AAA".into()),
("X-Timer".into(), "S1234567890.000,VS0,VE5".into()),
];
let detected = classifier::detect(200, &headers, b"");
assert!(
detected.len() >= 2,
"multi-WAF chain must surface every layer. Got only: {detected:?}"
);
}
// A made-up Server banner must not match any rule.
#[test]
fn unknown_vendor_banner_does_not_false_positive() {
use crate::waf_detect::classifier;
let detected = classifier::detect(
200,
&[("Server".into(), "totally-fake-vendor-xyz-123".into())],
b"",
);
assert!(
detected.is_empty(),
"garbage vendor must not match anything: got {detected:?}"
);
}
// Body patterns are case-insensitive as well.
#[test]
fn body_regex_with_capitalized_literal_matches_lowercased_body() {
let mut engine = RuleEngine::default();
engine
.load_from_str(
r#"
[[waf]]
name = "BodyCaseWAF"
vendor = "test"
confidence_threshold = 0.3
[[waf.signature]]
body_regex = "BLOCKED BY THIS WAF"
weight = 0.6
"#,
)
.expect("load");
engine.compile_body_regex_set().expect("compile");
let detected = engine.detect(200, &[], "you have been blocked by this waf");
assert!(
detected.iter().any(|r| r.name == "BodyCaseWAF"),
"body regex with capitalized literal must match lowercased body. Got: {detected:?}"
);
}
// Cookie patterns are case-insensitive as well.
#[test]
fn cookie_regex_with_capitalized_literal_matches_lowercased_value() {
let mut engine = RuleEngine::default();
engine
.load_from_str(
r#"
[[waf]]
name = "CookieCaseWAF"
vendor = "test"
confidence_threshold = 0.3
[[waf.signature]]
cookie_regex = "VISITOR_SESSION"
weight = 0.6
"#,
)
.expect("load");
engine.compile_body_regex_set().expect("compile");
let headers = vec![("set-cookie".into(), "visitor_session=abc; Path=/".into())];
let detected = engine.detect(200, &headers, "");
assert!(
detected.iter().any(|r| r.name == "CookieCaseWAF"),
"cookie regex with capitalized literal must match lowercased Set-Cookie value. Got: {detected:?}"
);
}
// The same header name repeated several times still yields a detection.
#[test]
fn repeated_header_values_in_chain_both_get_scanned() {
use crate::waf_detect::classifier;
let detected = classifier::detect(
200,
&[
("X-Served-By".into(), "cache-aaa11111-AAA".into()),
("X-Served-By".into(), "cache-bbb22222-BBB".into()),
("X-Served-By".into(), "cache-ccc33333-CCC".into()),
],
b"",
);
assert!(
!detected.is_empty(),
"repeated header values must each be eligible for matching"
);
}
// Multi-byte characters in header values and body must not cause a panic;
// the result itself is not checked.
#[test]
fn header_value_with_non_ascii_bytes_does_not_panic() {
use crate::waf_detect::classifier;
let detected = classifier::detect(
200,
&[
("Server".into(), "Cloudflåre — €dge".into()),
("X-Block-Reason".into(), "→ denied".into()),
],
b"blocked by \xe2\x86\x92 firewall",
);
let _ = detected;
}
// Empty or near-empty inputs must not panic. The assertion only requires that
// any first result has a non-empty name.
#[test]
fn empty_inputs_never_panic_or_false_positive() {
use crate::waf_detect::classifier;
for (status, headers, body) in [
(200, vec![], &b""[..]),
(0, vec![], &b""[..]),
(599, vec![("".into(), "".into())], &b""[..]),
(404, vec![("X-Empty".into(), "".into())], &b""[..]),
] {
let detected = classifier::detect(status, &headers, body);
assert!(
detected.is_empty() || !detected[0].name.is_empty(),
"empty input must not false-positive: {detected:?}"
);
}
}
// A 100 KiB header value must be processed without panicking; the result is not checked.
#[test]
fn extremely_long_header_value_does_not_panic_or_hang() {
use crate::waf_detect::classifier;
let value = "a".repeat(100 * 1024);
let detected = classifier::detect(200, &[("X-Junk".into(), value)], b"");
let _ = detected;
}
// Scrambling the letter case of header names and values must not change which
// WAFs are reported.
#[test]
fn detection_is_stable_under_random_header_casing() {
use crate::waf_detect::classifier;
let canonical = vec![
("Server".to_string(), "AkamaiGHost".to_string()),
("X-Akam-SW-Version".to_string(), "12.5".to_string()),
];
let scrambled = vec![
("sErVeR".to_string(), "AKamaIghOSt".to_string()),
("X-aKam-sw-VeRsIoN".to_string(), "12.5".to_string()),
];
let a = classifier::detect(200, &canonical, b"");
let b = classifier::detect(200, &scrambled, b"");
let names_a: Vec<_> = a.iter().map(|r| r.name.clone()).collect();
let names_b: Vec<_> = b.iter().map(|r| r.name.clone()).collect();
assert_eq!(
names_a, names_b,
"case randomization changed detection result"
);
}
// A pattern with nested bounded repetition must be rejected at compile time,
// with an error that names the kind.
#[test]
fn redos_explosion_pattern_is_rejected_not_hung() {
let pat = r"(.{1,100}){50}";
let result = compile_ci_regex(pat, "header");
assert!(
result.is_err(),
"NFA-explosion pattern `{pat}` must be rejected by size_limit"
);
let msg = result.unwrap_err();
assert!(
msg.contains("header"),
"error must name the regex kind; got: {msg}"
);
}
// The same oversized pattern inside a rule file makes load_from_str fail.
#[test]
fn redos_explosion_in_rule_file_is_rejected_with_error() {
let toml = r#"
[[waf]]
name = "ExplosionWAF"
vendor = "test"
confidence_threshold = 0.3
[[waf.signature]]
header_regex = "(.{1,100}){50}"
weight = 0.5
"#;
let mut engine = RuleEngine::default();
let result = engine.load_from_str(toml);
assert!(
result.is_err(),
"NFA-explosion header_regex must surface as load error"
);
}
// An oversized body pattern must already fail in load_from_str. The branch
// that checks compile_body_regex_set only runs if loading unexpectedly
// succeeded, and the final assertion then fails anyway.
#[test]
fn redos_explosion_in_body_regex_set_is_rejected_with_error() {
let toml = r#"
[[waf]]
name = "BodyExplosionWAF"
vendor = "test"
confidence_threshold = 0.3
[[waf.signature]]
body_regex = "(.{1,100}){50}"
weight = 0.5
"#;
let mut engine = RuleEngine::default();
let load_result = engine.load_from_str(toml);
if load_result.is_ok() {
let set_result = engine.compile_body_regex_set();
assert!(
set_result.is_err(),
"NFA-explosion body_regex must surface as error from compile_body_regex_set"
);
}
assert!(
load_result.is_err(),
"NFA-explosion body_regex must be caught by per-rule compile step"
);
}
// Ordinary patterns still compile under the size limit.
#[test]
fn benign_patterns_still_compile_after_size_limit_applied() {
let patterns = [
"cloudflare",
r"cache-[a-z]{3}[0-9]+-[A-Z]{3}",
r"X-Sucuri-ID",
r"(\d{1,3}\.){3}\d{1,3}",
r"(?i)blocked by",
r"(?-i)BinarySec",
];
for pat in &patterns {
let result = compile_ci_regex(pat, "header");
assert!(
result.is_ok(),
"benign pattern `{pat}` must still compile after size_limit: {:?}",
result.err()
);
}
}
}