use crate::ensemble_dilution::RuleGroup;
use crate::error::{Result, WafModelError};
use crate::oracle::WafOracle;
use crate::outcome::Outcome;
use crate::solve::preimage_for;
use crate::transduce::{Pipeline, Stage};
use wafrift_types::Request;
/// One differential probe: a dangerous token paired with a benign look-alike.
///
/// Probes loaded through `battery_from_toml` are checked by `validate`, which
/// requires the twin to have the same byte length as the token, to differ from
/// it, and to differ only at positions where both bytes are ASCII letters.
/// Probes built directly with `new` are not validated.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenProbe {
/// The token whose policing is being tested.
pub token: String,
/// The control value sent through the same carrier as `token`.
pub benign_twin: String,
/// Rule group this probe is filed under.
pub class: RuleGroup,
}
impl TokenProbe {
    /// Builds a probe from a token, its benign twin and the rule group it belongs to.
    ///
    /// No validation happens here; see `validate`.
    pub fn new(token: impl Into<String>, benign_twin: impl Into<String>, class: RuleGroup) -> Self {
        let token = token.into();
        let benign_twin = benign_twin.into();
        Self {
            token,
            benign_twin,
            class,
        }
    }

    /// Checks that the twin is a letter-only perturbation of the token.
    ///
    /// Returns a human-readable reason when the twin has a different byte
    /// length, is identical to the token, or differs at a position where
    /// either byte is not an ASCII letter.
    fn validate(&self) -> std::result::Result<(), String> {
        let token = &self.token;
        let twin = &self.benign_twin;

        if token.len() != twin.len() {
            return Err(format!(
                "twin {:?} must match token {:?} byte length",
                twin, token
            ));
        }
        if token == twin {
            return Err(format!("twin must differ from token {:?}", token));
        }

        // First position where the bytes differ and at least one side is not a letter.
        let offending = token.bytes().zip(twin.bytes()).position(|(tb, wb)| {
            tb != wb && (!tb.is_ascii_alphabetic() || !wb.is_ascii_alphabetic())
        });

        match offending {
            Some(i) => Err(format!(
                "at index {i}, twin {:?} differs from {:?} on a non-letter byte — that \
                 perturbs the structural skeleton",
                twin, token
            )),
            None => Ok(()),
        }
    }
}
/// Classification of one probe, derived from the oracle's answers for the
/// dangerous token and for its benign twin (see `Verdict::from_outcomes`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Verdict {
/// The dangerous token was blocked while its twin passed.
Policed,
/// Both the dangerous token and its twin passed.
Unpoliced,
/// Both the dangerous token and its twin were blocked.
CarrierGate,
/// The dangerous token passed but its twin was blocked; `characterize` also
/// uses this when either query returned an error.
Inconclusive,
}
impl Verdict {
    /// Maps the pair (outcome for the dangerous token, outcome for its twin)
    /// onto a verdict.
    ///
    /// | dangerous | twin  | verdict        |
    /// |-----------|-------|----------------|
    /// | Block     | Pass  | `Policed`      |
    /// | Pass      | Pass  | `Unpoliced`    |
    /// | Block     | Block | `CarrierGate`  |
    /// | Pass      | Block | `Inconclusive` |
    #[must_use]
    pub fn from_outcomes(dangerous: Outcome, twin: Outcome) -> Self {
        match dangerous {
            Outcome::Block => match twin {
                Outcome::Pass => Self::Policed,
                Outcome::Block => Self::CarrierGate,
            },
            Outcome::Pass => match twin {
                Outcome::Pass => Self::Unpoliced,
                Outcome::Block => Self::Inconclusive,
            },
        }
    }
}
/// The verdict reached for a single probed token.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenFinding {
/// The dangerous token that was probed (copied from the probe).
pub token: String,
/// Rule group of the probe that produced this finding.
pub class: RuleGroup,
/// Verdict reached for `token`.
pub verdict: Verdict,
}
/// Result of running a probe battery through `characterize`.
#[derive(Debug, Clone, Default)]
pub struct FilterProfile {
/// One finding per probe, in battery order.
pub findings: Vec<TokenFinding>,
/// Increase in the oracle's query counter over the run.
pub queries: u64,
/// Number of probes for which at least one of the two queries returned an error.
pub transport_errors: u64,
}
impl FilterProfile {
pub fn policed(&self) -> impl Iterator<Item = &TokenFinding> {
self.findings
.iter()
.filter(|f| f.verdict == Verdict::Policed)
}
pub fn unpoliced(&self) -> impl Iterator<Item = &TokenFinding> {
self.findings
.iter()
.filter(|f| f.verdict == Verdict::Unpoliced)
}
pub fn carrier_gated(&self) -> impl Iterator<Item = &TokenFinding> {
self.findings
.iter()
.filter(|f| f.verdict == Verdict::CarrierGate)
}
#[must_use]
pub fn is_policed(&self, token: &str) -> bool {
self.findings
.iter()
.any(|f| f.token == token && f.verdict == Verdict::Policed)
}
}
pub fn characterize<O, F>(
oracle: &mut O,
battery: &[TokenProbe],
carrier: F,
) -> Result<FilterProfile>
where
O: WafOracle,
F: Fn(&str) -> Request,
{
let mut findings = Vec::with_capacity(battery.len());
let mut transport_errors = 0u64;
let before = oracle.queries();
for probe in battery {
let dangerous = oracle.classify(&carrier(&probe.token));
let twin = oracle.classify(&carrier(&probe.benign_twin));
let verdict = match (dangerous, twin) {
(Ok(d), Ok(t)) => Verdict::from_outcomes(d, t),
_ => {
transport_errors += 1;
Verdict::Inconclusive
}
};
findings.push(TokenFinding {
token: probe.token.clone(),
class: probe.class,
verdict,
});
}
Ok(FilterProfile {
findings,
queries: oracle.queries().saturating_sub(before),
transport_errors,
})
}
/// A policed token whose encoded form was passed by the oracle
/// (produced by `probe_decode_gaps`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DecodeGap {
/// The policed token the gap was found for.
pub token: String,
/// Label of the decode stage used to build the pre-image (e.g. `"url_decode"`).
pub stage: &'static str,
/// Pre-image bytes computed for `token` under `stage`.
pub encoded_preimage: Vec<u8>,
}
/// Labelled decode stages tried by `probe_decode_gaps`, in probing order.
fn decode_probe_stages() -> Vec<(&'static str, Stage)> {
    let url_decode = Stage::UrlDecode {
        plus_is_space: false,
    };
    Vec::from([
        ("url_decode", url_decode),
        ("double_url_decode", Stage::DoubleUrlDecode),
        ("html_entity_decode", Stage::HtmlEntityDecode),
        ("nfkc_normalize", Stage::NfkcNormalize),
        ("bestfit_downconvert", Stage::BestFitDownconvert),
        ("base64_decode", Stage::Base64Decode),
        ("hex_decode", Stage::HexDecode),
    ])
}
pub fn probe_decode_gaps<O, F>(
oracle: &mut O,
profile: &FilterProfile,
carrier: F,
) -> Result<Vec<DecodeGap>>
where
O: WafOracle,
F: Fn(&str) -> Request,
{
let stages = decode_probe_stages();
let mut gaps = Vec::new();
for finding in profile.policed() {
for (label, stage) in &stages {
let sink = Pipeline(vec![stage.clone()]);
let encoded = preimage_for(finding.token.as_bytes(), &sink, true);
if encoded == finding.token.as_bytes() {
continue; }
let value = String::from_utf8_lossy(&encoded).into_owned();
if matches!(oracle.classify(&carrier(&value))?, Outcome::Pass) {
gaps.push(DecodeGap {
token: finding.token.clone(),
stage: label,
encoded_preimage: encoded,
});
}
}
}
Ok(gaps)
}
// Default probe battery, embedded at compile time from `../rules/filter/tokens.toml`.
const DEFAULT_BATTERY_TOML: &str = include_str!("../rules/filter/tokens.toml");
/// Parses and returns the probe battery embedded in the crate.
///
/// # Panics
///
/// Panics if `battery_from_toml` rejects the embedded TOML.
#[must_use]
pub fn default_battery() -> Vec<TokenProbe> {
    let embedded = battery_from_toml(DEFAULT_BATTERY_TOML);
    embedded.expect("embedded default filter battery must be valid (asserted in tests)")
}
/// Parses a probe battery from TOML text made of `[[probe]]` tables, each with
/// string keys `token`, `benign_twin` and `class`.
///
/// `class` must equal the `name()` of one of `RuleGroup::ALL`, and every probe
/// must pass `TokenProbe::validate`.
///
/// # Errors
///
/// Returns `WafModelError::Artifact` when the text is not valid TOML for this
/// schema, when it contains no `[[probe]]` entry, when a class name is unknown,
/// or when a probe fails validation. Processing stops at the first bad row.
pub fn battery_from_toml(src: &str) -> Result<Vec<TokenProbe>> {
    #[derive(serde::Deserialize)]
    struct ProbeRow {
        token: String,
        benign_twin: String,
        class: String,
    }
    #[derive(serde::Deserialize)]
    struct ProbeFile {
        #[serde(default)]
        probe: Vec<ProbeRow>,
    }

    let rows = match toml::from_str::<ProbeFile>(src) {
        Ok(file) => file.probe,
        Err(e) => {
            return Err(WafModelError::Artifact(format!(
                "parsing filter battery TOML: {e}"
            )));
        }
    };
    if rows.is_empty() {
        return Err(WafModelError::Artifact(
            "filter battery has no [[probe]] entries".into(),
        ));
    }

    rows.into_iter()
        .map(|row| {
            // Resolve the class name first so an unknown class is reported
            // before any twin validation error.
            let class = match RuleGroup::ALL
                .iter()
                .copied()
                .find(|g| g.name() == row.class)
            {
                Some(group) => group,
                None => {
                    let known = RuleGroup::ALL.iter().map(|g| g.name()).collect::<Vec<_>>();
                    return Err(WafModelError::Artifact(format!(
                        "unknown probe class {:?} (expected one of {:?})",
                        row.class, known,
                    )));
                }
            };

            let probe = TokenProbe::new(row.token, row.benign_twin, class);
            match probe.validate() {
                Ok(()) => Ok(probe),
                Err(why) => Err(WafModelError::Artifact(format!(
                    "invalid filter probe: {why}"
                ))),
            }
        })
        .collect()
}
// Unit tests for probe characterisation, battery loading and decode-gap probing.
#[cfg(test)]
mod tests {
use super::*;
use crate::oracle::FnOracle;
// Test oracle: blocks a request when its URL contains any of the `policed`
// substrings verbatim, and passes it otherwise. It performs no decoding.
fn literal_token_waf(
policed: &'static [&'static str],
) -> FnOracle<impl FnMut(&Request) -> Result<Outcome>> {
FnOracle::new(move |req: &Request| {
let url = req.url();
let blocked = policed.iter().any(|tok| url.contains(tok));
Ok(if blocked {
Outcome::Block
} else {
Outcome::Pass
})
})
}
// Carrier used by the tests: places `value` inside the `q` query parameter of a GET request.
fn query_carrier(value: &str) -> Request {
Request::get(format!("https://target.test/search?q=lookup-{value}-end"))
}
// A token blocked by the WAF whose twin passes must come out as Policed,
// using two queries for the single probe.
#[test]
fn policed_token_is_isolated_from_its_benign_twin() {
let mut waf = literal_token_waf(&["<script>"]);
let battery = [TokenProbe::new(
"<script>",
"<scrupt>",
RuleGroup::CrossSiteScripting,
)];
let profile = characterize(&mut waf, &battery, query_carrier).unwrap();
assert_eq!(
profile.queries, 2,
"exactly two membership queries per probe"
);
assert_eq!(profile.transport_errors, 0);
assert!(profile.is_policed("<script>"));
let policed: Vec<_> = profile.policed().map(|f| f.token.as_str()).collect();
assert_eq!(policed, vec!["<script>"]);
}
// The WAF only knows "union select", so the XSS token and its twin both pass: Unpoliced.
#[test]
fn unpoliced_token_is_the_prize() {
let mut waf = literal_token_waf(&["union select"]);
let battery = [TokenProbe::new(
"<svg onload=",
"<svq onloxd=",
RuleGroup::CrossSiteScripting,
)];
let profile = characterize(&mut waf, &battery, query_carrier).unwrap();
assert!(!profile.is_policed("<svg onload="));
let unpoliced: Vec<_> = profile.unpoliced().map(|f| f.token.as_str()).collect();
assert_eq!(
unpoliced,
vec!["<svg onload="],
"an unpoliced token must be surfaced"
);
}
// The WAF blocks on "<", which both the token and its twin contain: CarrierGate, not Policed.
#[test]
fn carrier_gate_when_both_halves_block() {
let mut waf = literal_token_waf(&["<"]);
let battery = [TokenProbe::new(
"<script>",
"<scrupt>",
RuleGroup::CrossSiteScripting,
)];
let profile = characterize(&mut waf, &battery, query_carrier).unwrap();
assert!(
!profile.is_policed("<script>"),
"both-block must NOT read as Policed"
);
let gated: Vec<_> = profile.carrier_gated().map(|f| f.token.as_str()).collect();
assert_eq!(gated, vec!["<script>"]);
}
// An oracle that always errors must yield one transport error per probe and
// only Inconclusive verdicts.
#[test]
fn transport_error_is_inconclusive_not_a_guess() {
let mut waf =
FnOracle::new(|_req: &Request| Err(crate::error::WafModelError::Oracle("down".into())));
let battery = default_battery();
let n = battery.len();
let profile = characterize(&mut waf, &battery, query_carrier).unwrap();
assert_eq!(profile.transport_errors, n as u64, "every probe erred");
assert!(profile.policed().next().is_none());
assert!(profile.unpoliced().next().is_none());
assert!(
profile
.findings
.iter()
.all(|f| f.verdict == Verdict::Inconclusive),
"transport failure must never produce an actionable verdict"
);
}
// Pins all four (dangerous, twin) outcome combinations of `Verdict::from_outcomes`.
#[test]
fn from_outcomes_truth_table_is_exhaustive() {
assert_eq!(
Verdict::from_outcomes(Outcome::Block, Outcome::Pass),
Verdict::Policed
);
assert_eq!(
Verdict::from_outcomes(Outcome::Pass, Outcome::Pass),
Verdict::Unpoliced
);
assert_eq!(
Verdict::from_outcomes(Outcome::Block, Outcome::Block),
Verdict::CarrierGate
);
assert_eq!(
Verdict::from_outcomes(Outcome::Pass, Outcome::Block),
Verdict::Inconclusive
);
}
// Re-checks, independently of `TokenProbe::validate`, that every embedded twin has
// the token's byte length, differs from it, and differs only where both bytes are
// ASCII letters.
#[test]
fn default_battery_twins_preserve_structure_and_only_swap_letters() {
for p in default_battery() {
assert_eq!(
p.benign_twin.len(),
p.token.len(),
"twin {:?} must match token {:?} byte length",
p.benign_twin,
p.token
);
assert_ne!(p.token, p.benign_twin, "twin must differ from the token");
for (i, (tb, wb)) in p.token.bytes().zip(p.benign_twin.bytes()).enumerate() {
if tb != wb {
assert!(
tb.is_ascii_alphabetic() && wb.is_ascii_alphabetic(),
"at index {i}, twin {:?} differs from {:?} on a NON-letter byte \
({tb:#x} vs {wb:#x}) — that perturbs the structural skeleton",
p.benign_twin,
p.token
);
}
}
}
}
// The embedded battery must hold at least ten probes and cover the four listed rule groups.
#[test]
fn embedded_default_battery_parses_and_is_non_trivial() {
let battery = default_battery();
assert!(battery.len() >= 10, "default battery unexpectedly small");
let classes: std::collections::HashSet<RuleGroup> =
battery.iter().map(|p| p.class).collect();
for required in [
RuleGroup::CrossSiteScripting,
RuleGroup::SqlInjection,
RuleGroup::FileInclusion,
RuleGroup::RemoteCodeExecution,
] {
assert!(
classes.contains(&required),
"default battery missing class {required:?}"
);
}
}
// A one-probe TOML file parses into a single probe with the expected token and class.
#[test]
fn battery_from_toml_round_trips_a_minimal_file() {
let src = r#"
[[probe]]
token = "<script>"
benign_twin = "<scrupt>"
class = "xss"
"#;
let battery = battery_from_toml(src).expect("valid battery");
assert_eq!(battery.len(), 1);
assert_eq!(battery[0].token, "<script>");
assert_eq!(battery[0].class, RuleGroup::CrossSiteScripting);
}
// The twin here is one byte shorter than the token, so loading must fail with
// the "invalid filter probe" error.
#[test]
fn battery_from_toml_fails_closed_on_a_structurally_invalid_twin() {
let src = r#"
[[probe]]
token = "<script>"
benign_twin = "<scrupx"
class = "xss"
"#;
let err = battery_from_toml(src).expect_err("must reject a bad twin");
assert!(
format!("{err}").contains("invalid filter probe"),
"got: {err}"
);
}
// Both an unrecognised class name and a file without any probe are rejected.
#[test]
fn battery_from_toml_rejects_unknown_class_and_empty_file() {
let bad_class = r#"
[[probe]]
token = "system("
benign_twin = "systxm("
class = "totally-made-up"
"#;
assert!(
battery_from_toml(bad_class).is_err(),
"unknown class must be rejected"
);
assert!(
battery_from_toml("# nothing here\n").is_err(),
"empty battery must be rejected"
);
}
// Test helper: one pass of percent-decoding. A `%` followed by two hex digits is
// replaced by the corresponding byte; every other byte is copied unchanged. The
// result is converted to a String lossily.
fn pct_decode_once(s: &str) -> String {
let b = s.as_bytes();
let mut out = Vec::with_capacity(b.len());
let mut i = 0;
while i < b.len() {
// `i + 2 < b.len()` guarantees both digit positions exist before indexing them.
if b[i] == b'%' && i + 2 < b.len() {
let hi = (b[i + 1] as char).to_digit(16);
let lo = (b[i + 2] as char).to_digit(16);
if let (Some(h), Some(l)) = (hi, lo) {
out.push((h * 16 + l) as u8);
i += 3;
continue;
}
}
out.push(b[i]);
i += 1;
}
String::from_utf8_lossy(&out).into_owned()
}
// Test fixture: a profile containing a single Policed finding for "<script>".
fn policed_script_profile() -> FilterProfile {
FilterProfile {
findings: vec![TokenFinding {
token: "<script>".to_string(),
class: RuleGroup::CrossSiteScripting,
verdict: Verdict::Policed,
}],
queries: 0,
transport_errors: 0,
}
}
// Against a WAF that matches the literal token only, the url_decode and
// base64_decode pre-images must be reported as gaps, all attributed to "<script>".
#[test]
fn decode_gap_probe_finds_every_gap_against_a_no_decode_literal_waf() {
let mut waf = literal_token_waf(&["<script>"]);
let gaps = probe_decode_gaps(&mut waf, &policed_script_profile(), query_carrier).unwrap();
let stages: std::collections::HashSet<&str> = gaps.iter().map(|g| g.stage).collect();
assert!(
stages.contains("url_decode"),
"a no-decode WAF must expose the url_decode gap"
);
assert!(stages.contains("base64_decode"));
assert!(
gaps.iter().all(|g| g.token == "<script>"),
"every gap must be attributed to the policed token"
);
}
// The WAF here percent-decodes the URL once before matching, so url_decode must
// not be reported while double_url_decode and base64_decode still are.
#[test]
fn decode_gap_probe_omits_the_transform_the_waf_actually_replicates() {
let mut waf = FnOracle::new(|req: &Request| {
let decoded = pct_decode_once(req.url());
Ok(if decoded.contains("<script>") {
Outcome::Block
} else {
Outcome::Pass
})
});
let gaps = probe_decode_gaps(&mut waf, &policed_script_profile(), query_carrier).unwrap();
let stages: std::collections::HashSet<&str> = gaps.iter().map(|g| g.stage).collect();
assert!(
!stages.contains("url_decode"),
"the WAF replicates url-decode, so it must NOT be reported as a gap: {stages:?}"
);
assert!(
stages.contains("double_url_decode"),
"the WAF does not double-url-decode → that IS a gap: {stages:?}"
);
assert!(
stages.contains("base64_decode"),
"the WAF does not base64-decode → that IS a gap: {stages:?}"
);
}
// With an empty profile there is nothing policed, so no gap is reported and the
// oracle is never queried.
#[test]
fn decode_gap_probe_is_empty_when_no_token_is_policed() {
let mut waf = literal_token_waf(&["<script>"]);
let empty = FilterProfile::default();
let gaps = probe_decode_gaps(&mut waf, &empty, query_carrier).unwrap();
assert!(gaps.is_empty());
assert_eq!(
waf.queries(),
0,
"no policed token ⇒ no decode-gap probes sent"
);
}
// End-to-end run of the embedded battery against a WAF that polices three
// tokens: those three are Policed, the two RCE tokens checked below are
// Unpoliced, and exactly two queries are spent per probe.
#[test]
fn full_default_battery_classifies_a_mixed_waf_end_to_end() {
let mut waf = literal_token_waf(&["<script>", "union select", "/etc/passwd"]);
let profile = characterize(&mut waf, &default_battery(), query_carrier).unwrap();
assert!(profile.is_policed("<script>"));
assert!(profile.is_policed("union select"));
assert!(profile.is_policed("/etc/passwd"));
let unpoliced: std::collections::HashSet<_> =
profile.unpoliced().map(|f| f.token.as_str()).collect();
assert!(
unpoliced.contains("system("),
"unpoliced RCE token must be surfaced"
);
assert!(unpoliced.contains(";bash -i"));
assert_eq!(profile.transport_errors, 0);
assert_eq!(profile.queries, default_battery().len() as u64 * 2);
}
}