wafrift-wafmodel 0.3.1

//! The defensive dual: from a decompiled WAF's holes, synthesize the
//! *minimal* rules that close them, prove zero new false positives
//! against a benign corpus, and **prove the class is closed**
//! (no attack-grammar member survives the hardened config).
//!
//! Offensive WAF tools are commodities. A tool that hands a defender a
//! verified, FP-free patch *and a proof the gap is gone* is a category
//! nobody owns. The proof is constructive and checkable: build the
//! hardened WAF, enumerate the attack class, assert none passes; the
//! negative twin (drop one synthesized rule ⇒ exactly its holes
//! reopen) proves every rule is load-bearing, not decorative.

use crate::learn::Alphabet;
use crate::mine::attack_grammar;
use crate::normalize::{self, Transform};
use crate::oracle::{ChannelSet, Rule, SimRegexWaf};
use crate::outcome::Outcome;
use crate::sfa::Sfa;
use regex::bytes::Regex;
use wafrift_types::Request;

/// What the hardener produced and proved.
#[derive(Debug)]
pub struct ClosureReport {
    /// The synthesized rules a defender deploys.
    pub added_rules: Vec<Rule>,
    /// Attack-class members the WAF passed *before* hardening.
    pub holes_before: usize,
    /// Attack-class members the WAF passes *after* hardening.
    pub holes_after: usize,
    /// Benign-corpus requests the synthesized rules wrongly block.
    pub benign_false_positives: usize,
    /// `true` iff `holes_after == 0` *and* `benign_false_positives == 0`
    /// — closure proven with no precision regression. Never asserted
    /// on faith; computed from the hardened WAF.
    pub proven_closed: bool,
}

fn body(bytes: &[u8]) -> Request {
    Request::post("https://h/p", bytes.to_vec()).header("Content-Type", "application/json")
}

/// Enumeration cap for hole-search. Bounded to keep memory + WAF
/// query count predictable on huge attack grammars. A hit at this
/// boundary means the enumeration was TRUNCATED, not exhaustive —
/// see `holes()` for how that fact propagates into `proven_closed`.
const HOLES_ENUM_CAP: usize = 4096;

/// Enumerate attack-class members (bounded) that `waf` lets through.
/// Returns `(holes, truncated)`. `truncated == true` means the hole
/// collector itself hit `HOLES_ENUM_CAP` HOLES — there are at least
/// that many attack-class members the WAF passes; callers MUST NOT
/// claim "proven_closed" in that case.
///
/// Crucially, `truncated` is based on the count of HOLES FOUND, not
/// the total number of attack strings examined. A large attack grammar
/// whose members are ALL blocked after hardening can be exhaustively
/// checked without setting `truncated`, so `proven_closed` remains
/// accurate even when the grammar has millions of members.
fn holes(waf: &SimRegexWaf, attack: &Sfa, max_len: usize) -> (Vec<Vec<u8>>, bool) {
    // Enumerate a large prefix of the attack grammar. We examine up to
    // HOLES_ENUM_CAP × 16 attack strings so we don't stop searching too
    // early when most strings are blocked. Increasing this multiplier
    // extends coverage at the cost of WAF-classify calls.
    const ATTACK_STRINGS_BUDGET: usize = HOLES_ENUM_CAP * 16;
    let enumerated = attack.enumerate_accepted(ATTACK_STRINGS_BUDGET, max_len);
    let mut holes = Vec::new();
    for bytes in enumerated {
        if waf.classify_uncounted(&body(&bytes)) == Outcome::Pass {
            holes.push(bytes);
            if holes.len() >= HOLES_ENUM_CAP {
                return (holes, true); // HOLES cap hit — truncated
            }
        }
    }
    (holes, false)
}

/// Synthesize the minimal rules that close every hole for the attack
/// class defined by `needles`, verify zero false positives on
/// `benign`, and prove the class is closed.
///
/// The closing pattern is *derived from the hole*: the attack token
/// the WAF missed, matched under CRS-grade normalization
/// (urlDecodeUni → htmlEntityDecode → lowercase) so encoded/cased
/// evasions are closed too — not a literal-only band-aid. A
/// synthesized rule that would false-positive on the benign corpus is
/// reported (it does not silently ship).
#[must_use]
pub fn synthesize_closure(
    waf: &SimRegexWaf,
    needles: &[&[u8]],
    channels: ChannelSet,
    benign: &[&[u8]],
    alpha: &Alphabet,
    max_len: usize,
) -> ClosureReport {
    let grammar = attack_grammar(alpha, needles);
    let (before, _before_truncated) = holes(waf, &grammar, max_len);

    // One closing rule per needle: match the (lowercased) token after
    // CRS-grade decoding, scoring at the inbound threshold so a single
    // hit blocks.
    let tf = vec![
        Transform::UrlDecodeUni,
        Transform::HtmlEntityDecode,
        Transform::Lowercase,
    ];
    // Multiple raw needles can decode to the same canonical form (e.g.
    // `<x`, `%3cx`, `&lt;x` all → `<x` after `tf`). Dedup so we don't
    // emit two rules with the same id — that would violate the
    // "proven_closed" contract (ClosureReport sees duplicate rule ids
    // as a malformed closure).
    let mut added = Vec::new();
    let mut seen_decoded = std::collections::HashSet::<Vec<u8>>::new();
    for n in needles {
        // Build the pattern on the DECODED form of the needle, not the raw
        // needle bytes. The synthesized rule applies `tf` to the request body
        // before testing the pattern — so the pattern must match what the body
        // looks like AFTER that transform chain, i.e. the decoded/lowercased
        // needle. Without this, a rule for needle `%3cx` would search for the
        // literal string `%3cx` in a body that has already been URL-decoded to
        // `<x` — the pattern can never match and the hole remains open.
        let decoded = normalize::apply_chain(&tf, n);
        if !seen_decoded.insert(decoded.clone()) {
            continue;
        }
        let pat = regex::escape(&String::from_utf8_lossy(&decoded));
        if let Ok(re) = Regex::new(&pat) {
            added.push(Rule {
                id: format!("synth-close-{}", String::from_utf8_lossy(&decoded)),
                channels,
                transforms: tf.clone(),
                pattern: re,
                score: waf.threshold(),
            });
        }
    }

    let hardened = waf.with_rules_added(added.clone());

    // Zero-FP proof: the hardened WAF must not block any benign input
    // the *original* WAF allowed (the synthesized rules add no FP).
    let benign_fp = benign
        .iter()
        .filter(|b| {
            waf.classify_uncounted(&body(b)) == Outcome::Pass
                && hardened.classify_uncounted(&body(b)) == Outcome::Block
        })
        .count();

    // Closure proof: re-enumerate the attack class against the
    // hardened WAF — every member must now be blocked.
    let (after, after_truncated) = holes(&hardened, &grammar, max_len);

    ClosureReport {
        added_rules: added,
        holes_before: before.len(),
        holes_after: after.len(),
        benign_false_positives: benign_fp,
        // F102: pre-fix `proven_closed = after.is_empty() && benign_fp
        // == 0`. If the post-hardening enumeration hit the 4096-cap
        // and ALL 4096 examined were blocked, the flag flipped true —
        // but longer attack members beyond the cap might still pass.
        // A defender deploys the synthesized rules trusting a proof
        // that doesn't hold. Require the enumeration to have been
        // exhaustive (`!after_truncated`) before claiming closure.
        proven_closed: after.is_empty() && benign_fp == 0 && !after_truncated,
    }
}