Skip to main content

wafrift_encoding/
url_mutate.rs

//! URL / query-string payload mutation — opt-in attack surface for
//! the proxy `--mutate-url` flag and the strategy engine's URL-aware
//! evade variants.
//!
//! Most production attacks live in the URL, not the request body:
//! `?id=1' OR 1=1--`, `?q=<script>alert(1)</script>`,
//! `?file=../../etc/passwd`. The default proxy pipeline only mutates
//! HTTP-layer artefacts (headers, body) which leaves this surface
//! uncovered. This module fills that gap when the operator opts in.
//!
//! Scope:
//! - mutates query parameter VALUES (not names — those drive routing)
//! - optionally mutates the path's last segment (rest is routing)
//! - never touches the host / scheme / port — those are pre-routing
//! - returns the URL unchanged when no `?` is present and path
//!   mutation is disabled
//!
//! Mutation strategies are intentionally a small fixed set chosen to
//! be effective against signature WAFs without requiring the heavier
//! grammar/encoding pipeline. Callers that want full pipeline
//! mutation should round-trip through `wafrift_strategy::evade` with
//! the parameter value lifted into the request body.
23
24use std::borrow::Cow;
25
26/// Knobs for [`mutate_url`].
27#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub struct UrlMutateConfig {
29    /// Mutate the query string. Default true.
30    pub mutate_query_values: bool,
31    /// Mutate the path's last segment (everything after the last `/`).
32    /// Default false — disabled because changing path semantics is
33    /// likely to break routing on most targets.
34    pub mutate_last_path_segment: bool,
35    /// Strategy to apply per value.
36    pub strategy: UrlStrategy,
37}
38
39impl Default for UrlMutateConfig {
40    fn default() -> Self {
41        Self {
42            mutate_query_values: true,
43            mutate_last_path_segment: false,
44            strategy: UrlStrategy::PercentEncodeAggressive,
45        }
46    }
47}
48
/// Largest input, in bytes, that [`UrlStrategy::DoublePercentEncode`]
/// will encode twice.
///
/// Output size grows linearly with input size: one aggressive pass
/// turns a byte into at most three characters (`%XX`), and a second
/// pass turns that into at most five (`%25XX`). An unbounded input is
/// therefore a memory-exhaustion vector. Real WAF test values are
/// kilobytes at most, so 1 MiB is a generous ceiling.
pub const MAX_DOUBLE_ENCODE_INPUT: usize = 1 << 20;
54
/// The encoding applied to each value selected for mutation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UrlStrategy {
    /// Emit every byte that is not an ASCII letter or digit as `%XX`.
    /// Aimed at signatures that match on the raw request bytes rather
    /// than on a canonicalised form.
    PercentEncodeAggressive,
    /// Run the aggressive pass twice, so `'` becomes `%27` and then
    /// `%2527`. Aimed at WAFs that URL-decode exactly once before
    /// matching.
    DoublePercentEncode,
    /// Use non-canonical spellings — `+` for a space, `%2F` for `/`,
    /// and so on — that some upstream parsers normalise but
    /// signatures do not.
    NonCanonicalSpaces,
    /// HTTP Parameter Pollution: insert empty PHP-style array brackets
    /// `[]` after the parameter name. Only meaningful when the *name*
    /// is being changed; at the value level this variant is not
    /// intended to alter the payload.
    Hpp,
}
74
75impl UrlStrategy {
76    /// Apply the strategy to a single decoded value, returning the
77    /// mutated raw form (already URL-safe — caller does not re-encode).
78    #[must_use]
79    pub fn apply(self, value: &str) -> String {
80        self.apply_bytes(value.as_bytes())
81    }
82
83    /// Byte-clean variant of [`Self::apply`] for percent-encoding
84    /// strategies. Lets callers run a non-UTF-8 byte sequence (e.g.
85    /// the raw bytes from a percent-decode on `%FF%FE`) through the
86    /// pipeline without it being silently rewritten to U+FFFD by
87    /// `String::from_utf8_lossy`. Each strategy that only operates
88    /// on bytes (PercentEncodeAggressive, DoublePercentEncode) is
89    /// byte-pure here. Strategies that need character semantics
90    /// (NonCanonicalSpaces) lossy-convert internally.
91    #[must_use]
92    pub fn apply_bytes(self, value: &[u8]) -> String {
93        match self {
94            Self::PercentEncodeAggressive => percent_encode_aggressive_bytes(value),
95            Self::DoublePercentEncode => {
96                // Two passes of aggressive percent-encoding can blow
97                // up to roughly 9× the input size on worst-case
98                // inputs (every byte → %XX → %25%XX). Cap the input
99                // so a malicious caller can't OOM via a 100 MB
100                // string asking for 900 MB of output.
101                if value.len() > MAX_DOUBLE_ENCODE_INPUT {
102                    return percent_encode_aggressive_bytes(value);
103                }
104                let first = percent_encode_aggressive_bytes(value);
105                percent_encode_aggressive_bytes(first.as_bytes())
106            }
107            Self::NonCanonicalSpaces => {
108                // NonCanonicalSpaces is char-based by design (its
109                // rules target Unicode whitespace + ASCII glyphs);
110                // lossy-convert here so the str path stays sane.
111                let s = String::from_utf8_lossy(value);
112                non_canonical_spaces(&s)
113            }
114            Self::Hpp => String::from_utf8_lossy(value).into_owned(),
115        }
116    }
117
118    /// Stable name used for technique logging.
119    #[must_use]
120    pub fn label(self) -> &'static str {
121        match self {
122            Self::PercentEncodeAggressive => "url:percent_encode",
123            Self::DoublePercentEncode => "url:double_percent",
124            Self::NonCanonicalSpaces => "url:noncanon_spaces",
125            Self::Hpp => "url:hpp",
126        }
127    }
128}
129
130/// Mutate `path_and_query` (no scheme/host) per `cfg`. Returns the
131/// mutated string and a list of technique labels actually applied.
132///
133/// Inputs are accepted in either form:
134///   `/path/segment?a=1&b=2`
135///   `/path/segment`            (no query — query mutation is a no-op)
136///   `?a=1`                     (no path — path mutation is a no-op)
137///   `/path?a=1#frag`           (fragment preserved verbatim)
138///
139/// Never panics, never returns empty for non-empty input.
140#[must_use]
141pub fn mutate_url(path_and_query: &str, cfg: &UrlMutateConfig) -> (String, Vec<&'static str>) {
142    // Reject full URLs (with scheme://host/...) at the boundary —
143    // mutate_url's contract is "path-and-query only". Pre-fix a full
144    // URL got split on '?' such that the scheme + host leaked into
145    // the "path" and got mutated, e.g. `https://example.com/p?q=1`
146    // had `https://example.com/p` percent-encoded as the last path
147    // segment. The caller almost certainly meant to pass the
148    // path-and-query directly; pass-through is the safe behaviour.
149    if path_and_query.starts_with("http://")
150        || path_and_query.starts_with("https://")
151        || path_and_query.starts_with("//")
152    {
153        return (path_and_query.to_string(), Vec::new());
154    }
155
156    // Split off any #fragment FIRST so query mutation can't encode the
157    // '#' delimiter and destroy fragment routing. Pre-fix the
158    // mutator turned `/p?q=1#frag` into `/p?q=1%23frag`, which the
159    // upstream then treated as a single (broken) query value.
160    let (without_frag, fragment) = match path_and_query.split_once('#') {
161        Some((rest, frag)) => (rest, Some(frag)),
162        None => (path_and_query, None),
163    };
164
165    let (path, query) = match without_frag.split_once('?') {
166        Some((p, q)) => (p.to_string(), Some(q.to_string())),
167        None => (without_frag.to_string(), None),
168    };
169    let mut techniques: Vec<&'static str> = Vec::new();
170
171    let new_path = if cfg.mutate_last_path_segment {
172        match mutate_last_segment(&path, cfg.strategy) {
173            Some(p) => {
174                techniques.push("url:path_segment");
175                techniques.push(cfg.strategy.label());
176                p
177            }
178            None => path,
179        }
180    } else {
181        path
182    };
183
184    let new_query = if cfg.mutate_query_values {
185        if let Some(q) = query.as_ref() {
186            let (mq, applied) = mutate_query_string(q, cfg.strategy);
187            if applied {
188                techniques.push("url:query_values");
189                techniques.push(cfg.strategy.label());
190            }
191            Some(mq)
192        } else {
193            query
194        }
195    } else {
196        query
197    };
198
199    let mut result = match new_query {
200        Some(q) => format!("{new_path}?{q}"),
201        None => new_path,
202    };
203    if let Some(frag) = fragment {
204        result.push('#');
205        result.push_str(frag);
206    }
207    (result, techniques)
208}
209
210fn mutate_last_segment(path: &str, strategy: UrlStrategy) -> Option<String> {
211    // Treat both literal '/' and percent-encoded slash (%2F or %2f)
212    // as segment boundaries — otherwise an attacker who pre-encodes
213    // a slash inside what looks like the last segment (e.g.
214    // /a/b%2Fc) would have the WHOLE tail (b%2Fc) mutated, when the
215    // logical last segment is `c`.
216    let normalized_last_slash = {
217        let lit = path.rfind('/');
218        let pct_upper = path.rfind("%2F").map(|i| i + 2);
219        let pct_lower = path.rfind("%2f").map(|i| i + 2);
220        [lit, pct_upper, pct_lower].into_iter().flatten().max()?
221    };
222    let (head, tail) = path.split_at(normalized_last_slash + 1);
223    if tail.is_empty() {
224        return None;
225    }
226    // Decode pre-existing percent escapes BEFORE re-applying the
227    // mutation strategy, into raw bytes (NOT through from_utf8_lossy)
228    // so that `%FF%FE` and other non-UTF-8 byte sequences survive
229    // the round-trip instead of being silently mangled into U+FFFD
230    // sequences (`%EF%BF%BD`).
231    let decoded = percent_decode_bytes(tail);
232    let mutated = strategy.apply_bytes(&decoded);
233    Some(format!("{head}{mutated}"))
234}
235
236/// Mutate every `name=value` pair, leaving `name` alone and mutating
237/// `value`. Pairs without `=` (bare flags) are passed through.
238///
239/// Empty pairs (consecutive `&&` separators) are PRESERVED rather
240/// than collapsed — some upstream frameworks (e.g. PHP, Rails 5+)
241/// treat them as distinct empty parameters, so collapsing changes
242/// the parsed parameter count.
243///
244/// `+` in a query value is interpreted as space per RFC 1866 form
245/// encoding before the strategy is applied — otherwise `q=1+1`
246/// would be mutated as if `+` were a literal plus sign.
247fn mutate_query_string(query: &str, strategy: UrlStrategy) -> (String, bool) {
248    let mut out = Vec::with_capacity(8);
249    let mut applied = false;
250    for pair in query.split('&') {
251        if pair.is_empty() {
252            // Preserve `&&` so the upstream sees the original
253            // parameter count.
254            out.push(String::new());
255            continue;
256        }
257        if let Some((name, value)) = pair.split_once('=') {
258            if value.is_empty() {
259                out.push(format!("{name}="));
260                continue;
261            }
262            // Form-decode `+` to space BEFORE percent-decoding so
263            // application/x-www-form-urlencoded semantics survive
264            // the mutation pipeline. Decode into raw bytes (NOT
265            // from_utf8_lossy) so non-UTF-8 escapes survive — see
266            // percent_decode_bytes for the U+FFFD-avoidance rationale.
267            let form_decoded = value.replace('+', " ");
268            let decoded = percent_decode_bytes(&form_decoded);
269            let mutated = strategy.apply_bytes(&decoded);
270            // Only set applied=true if the mutator actually changed
271            // the value. An all-alphanumeric value passed through
272            // PercentEncodeAggressive comes out byte-equal; reporting
273            // a technique was applied would falsely inflate the
274            // technique log.
275            if mutated.as_bytes() != value.as_bytes() {
276                applied = true;
277            }
278            out.push(format!("{name}={mutated}"));
279        } else {
280            out.push(pair.to_string());
281        }
282    }
283    (out.join("&"), applied)
284}
285
286/// Aggressive percent-encoding: every byte that is not `[A-Za-z0-9]`
287/// is encoded. Drops the URL safe-list (`-._~`) intentionally — those
288/// are the bytes signatures most often fail to canonicalise.
289#[allow(dead_code)]
290fn percent_encode_aggressive(s: &str) -> String {
291    percent_encode_aggressive_bytes(s.as_bytes())
292}
293
/// Percent-encode a raw byte slice aggressively: ASCII letters and
/// digits are copied through, every other byte is written as an
/// upper-case `%XX` escape.
///
/// Working on bytes rather than `str` means non-UTF-8 input (such as
/// the bytes behind a `%FF%FE` payload) is re-emitted exactly instead
/// of being replaced by U+FFFD.
fn percent_encode_aggressive_bytes(bytes: &[u8]) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";

    // Worst case is three output characters per input byte.
    let mut encoded = String::with_capacity(bytes.len().saturating_mul(3));
    for &byte in bytes {
        if byte.is_ascii_alphanumeric() {
            encoded.push(char::from(byte));
            continue;
        }
        encoded.push('%');
        encoded.push(char::from(HEX[usize::from(byte >> 4)]));
        encoded.push(char::from(HEX[usize::from(byte & 0x0F)]));
    }
    encoded
}
310
/// Re-spell `s` using non-canonical but URL-safe encodings.
///
/// Three classes of character are rewritten; everything else is copied
/// through unchanged:
/// - cosmetic substitutions that give the payload its WAF-bypass
///   shape: space → `+`, and `/ \ < > ' " ( )` → their `%XX` form;
/// - structural URL / form delimiters (`& = % # ? + ;`), which are
///   always escaped. The input has already been percent-decoded, so
///   emitting them raw would let a value such as `&c=evil` split into
///   additional parameters on the server (HTTP parameter injection);
/// - ASCII control characters, including NUL and DEL, which are
///   written as exact `%XX` escapes.
fn non_canonical_spaces(s: &str) -> String {
    use std::fmt::Write;

    // saturating_mul guards against usize overflow on 32-bit targets
    // for inputs in the gigabyte range.
    let mut encoded = String::with_capacity(s.len().saturating_mul(3));

    for ch in s.chars() {
        let substitution = match ch {
            // Cosmetic substitutions.
            ' ' => Some("+"),
            '/' => Some("%2F"),
            '\\' => Some("%5C"),
            '<' => Some("%3C"),
            '>' => Some("%3E"),
            '\'' => Some("%27"),
            '"' => Some("%22"),
            '(' => Some("%28"),
            ')' => Some("%29"),
            // Structural delimiters — must never reach the wire raw.
            '&' => Some("%26"),
            '=' => Some("%3D"),
            '%' => Some("%25"),
            '#' => Some("%23"),
            '?' => Some("%3F"),
            '+' => Some("%2B"),
            ';' => Some("%3B"),
            _ => None,
        };

        if let Some(text) = substitution {
            encoded.push_str(text);
        } else if ch.is_ascii_control() {
            // U+0000..=U+001F and U+007F. Writing to a `String`
            // cannot fail.
            let _ = write!(encoded, "%{:02X}", u32::from(ch));
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
356
357/// Decode `%xx` escapes into raw bytes, treating invalid sequences
358/// (lone `%`, `%G1`) as literal. Unlike [`percent_decode_lossy`],
359/// this never round-trips through `from_utf8_lossy` so non-UTF-8
360/// byte sequences (e.g. `%FF%FE`, overlong UTF-8 `%C0%AF`) survive
361/// intact. The downstream encoders re-emit them as exact `%XX`
362/// pairs instead of mangling them into `%EF%BF%BD` (U+FFFD), which
363/// is what removes WAF-bypass vectors.
364fn percent_decode_bytes(s: &str) -> Vec<u8> {
365    let bytes = s.as_bytes();
366    let mut out = Vec::with_capacity(bytes.len());
367    let mut i = 0;
368    while i < bytes.len() {
369        if bytes[i] == b'%'
370            && i + 2 < bytes.len()
371            && let (Some(h), Some(l)) = (hex_digit(bytes[i + 1]), hex_digit(bytes[i + 2]))
372        {
373            out.push(h * 16 + l);
374            i += 3;
375            continue;
376        }
377        out.push(bytes[i]);
378        i += 1;
379    }
380    out
381}
382
383/// Decode `%xx` escapes lossily, treating invalid sequences as
384/// literal. Returns `Cow::Borrowed` when nothing needed decoding.
385#[allow(dead_code)]
386fn percent_decode_lossy(s: &str) -> Cow<'_, str> {
387    if !s.contains('%') {
388        return Cow::Borrowed(s);
389    }
390    let bytes = s.as_bytes();
391    let mut out = Vec::with_capacity(bytes.len());
392    let mut i = 0;
393    while i < bytes.len() {
394        if bytes[i] == b'%'
395            && i + 2 < bytes.len()
396            && let (Some(h), Some(l)) = (hex_digit(bytes[i + 1]), hex_digit(bytes[i + 2]))
397        {
398            out.push(h * 16 + l);
399            i += 3;
400            continue;
401        }
402        out.push(bytes[i]);
403        i += 1;
404    }
405    Cow::Owned(String::from_utf8_lossy(&out).into_owned())
406}
407
/// Value (0–15) of an ASCII hexadecimal digit, accepting both upper-
/// and lower-case letters; `None` for any other byte.
fn hex_digit(b: u8) -> Option<u8> {
    // `to_digit(16)` only ever yields 0..=15, so narrowing to `u8`
    // cannot truncate.
    char::from(b).to_digit(16).map(|digit| digit as u8)
}
416
#[cfg(test)]
mod tests {
    use super::*;

    /// Config with query mutation on and the given strategy / path flag.
    fn cfg(strategy: UrlStrategy, mutate_path: bool) -> UrlMutateConfig {
        UrlMutateConfig {
            mutate_query_values: true,
            mutate_last_path_segment: mutate_path,
            strategy,
        }
    }

    // ── default-OFF semantics ──────────────────────────────────

    #[test]
    fn default_config_does_not_touch_path() {
        let c = UrlMutateConfig::default();
        assert!(!c.mutate_last_path_segment);
        let (out, _) = mutate_url("/admin/login?id=1", &c);
        assert_eq!(out, "/admin/login?id=1", "path must stay verbatim");
    }

    #[test]
    fn no_query_no_path_mutation_returns_input_unchanged() {
        let c = UrlMutateConfig::default();
        let (out, techniques) = mutate_url("/just/a/path", &c);
        assert_eq!(out, "/just/a/path");
        assert!(
            techniques.is_empty(),
            "no mutation must report no technique"
        );
    }

    #[test]
    fn query_mutation_can_be_disabled() {
        let c = UrlMutateConfig {
            mutate_query_values: false,
            ..UrlMutateConfig::default()
        };
        let (out, techniques) = mutate_url("/p?q=1'", &c);
        assert_eq!(out, "/p?q=1'");
        assert!(techniques.is_empty());
    }

    #[test]
    fn empty_value_pair_passes_through_unmutated() {
        let c = UrlMutateConfig::default();
        let (out, _) = mutate_url("/p?a=&b=2", &c);
        assert_eq!(out, "/p?a=&b=2", "empty value must stay empty");
    }

    #[test]
    fn bare_flag_param_passes_through() {
        let c = UrlMutateConfig::default();
        let (out, _) = mutate_url("/p?flag&other=1", &c);
        assert_eq!(out, "/p?flag&other=1");
    }

    #[test]
    fn consecutive_ampersands_are_preserved() {
        let c = UrlMutateConfig::default();
        let (out, _) = mutate_url("/p?a=1&&b=2", &c);
        assert_eq!(out, "/p?a=1&&b=2", "empty pairs must not be collapsed");
    }

    #[test]
    fn unchanged_value_reports_no_technique() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, false);
        let (out, techniques) = mutate_url("/p?q=abc", &c);
        assert_eq!(out, "/p?q=abc");
        assert!(techniques.is_empty(), "got {techniques:?}");
    }

    // ── per-strategy correctness ───────────────────────────────

    #[test]
    fn percent_encode_aggressive_encodes_quotes_and_spaces() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, false);
        let (out, t) = mutate_url("/p?id=1' OR '1'='1", &c);
        // Every non-alphanumeric must be encoded.
        assert_eq!(out, "/p?id=1%27%20OR%20%271%27%3D%271");
        assert_eq!(t, ["url:query_values", "url:percent_encode"]);
    }

    #[test]
    fn percent_encode_aggressive_skips_alphanumerics() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, false);
        let (out, _) = mutate_url("/p?q=ABCxyz123", &c);
        assert_eq!(out, "/p?q=ABCxyz123", "alnum must not be encoded");
    }

    #[test]
    fn double_percent_encode_doubles_each_byte() {
        let c = cfg(UrlStrategy::DoublePercentEncode, false);
        let (out, t) = mutate_url("/p?id='", &c);
        // "'" → %27 → %2527
        assert_eq!(out, "/p?id=%2527");
        assert_eq!(t, ["url:query_values", "url:double_percent"]);
    }

    #[test]
    fn non_canonical_spaces_swaps_known_chars() {
        let c = cfg(UrlStrategy::NonCanonicalSpaces, false);
        let (out, _) = mutate_url("/p?q=hello world<", &c);
        assert_eq!(out, "/p?q=hello+world%3C");
    }

    #[test]
    fn non_canonical_spaces_never_re_emits_decoded_delimiters() {
        // `%26c%3Devil` decodes to `&c=evil`; emitting that raw would
        // inject a second parameter.
        let c = cfg(UrlStrategy::NonCanonicalSpaces, false);
        let (out, _) = mutate_url("/p?q=a%26c%3Devil", &c);
        assert_eq!(out, "/p?q=a%26c%3Devil");
    }

    #[test]
    fn plus_in_query_value_is_form_decoded_to_space() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, false);
        let (out, _) = mutate_url("/p?q=1+1", &c);
        assert_eq!(out, "/p?q=1%201");
    }

    // ── path-segment mutation ──────────────────────────────────

    #[test]
    fn path_segment_mutation_changes_last_segment_only_when_enabled() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, true);
        // Tail contains `.` (non-alphanumeric) so the strategy bites.
        let (out, t) = mutate_url("/api/v1/admin.php", &c);
        assert_eq!(out, "/api/v1/admin%2Ephp");
        assert_eq!(t, ["url:path_segment", "url:percent_encode"]);
    }

    #[test]
    fn path_with_trailing_slash_is_not_mutated() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, true);
        let (out, t) = mutate_url("/api/v1/admin/", &c);
        // Empty tail after the trailing `/` → no mutation
        assert_eq!(out, "/api/v1/admin/");
        assert!(t.is_empty());
    }

    #[test]
    fn percent_encoded_slash_counts_as_segment_boundary() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, true);
        let (out, _) = mutate_url("/a/b%2Fc.d", &c);
        // Only `c.d` is the last segment; `b%2F` belongs to the head.
        assert_eq!(out, "/a/b%2Fc%2Ed");
    }

    // ── boundary handling ─────────────────────────────────────

    #[test]
    fn fragment_is_preserved_verbatim() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, false);
        let (out, _) = mutate_url("/p?q=1'#frag", &c);
        assert_eq!(out, "/p?q=1%27#frag");

        // A '?' that appears only inside the fragment is not a query.
        let (out, t) = mutate_url("/p#frag?x='", &c);
        assert_eq!(out, "/p#frag?x='");
        assert!(t.is_empty());
    }

    #[test]
    fn full_urls_pass_through_untouched() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, true);
        for input in [
            "http://example.com/a.b?q=1'",
            "https://example.com/a.b?q=1'",
            "//example.com/a.b?q=1'",
        ] {
            let (out, t) = mutate_url(input, &c);
            assert_eq!(out, input);
            assert!(t.is_empty(), "{input} reported {t:?}");
        }
    }

    // ── round-tripping pre-encoded input ──────────────────────

    #[test]
    fn pre_encoded_query_value_is_decoded_then_re_mutated() {
        // Operator's input is `%27` (encoded `'`); we should decode
        // first and then apply the strategy so we don't end up
        // double-encoding accidentally on PercentEncodeAggressive.
        let c = cfg(UrlStrategy::PercentEncodeAggressive, false);
        let (out, _) = mutate_url("/p?q=%27OR%27", &c);
        // Decoded: `'OR'` → re-aggressive-encoded: `%27OR%27`
        assert_eq!(out, "/p?q=%27OR%27");
    }

    #[test]
    fn non_utf8_escapes_survive_the_round_trip() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, false);
        let (out, _) = mutate_url("/p?q=%FF%FE", &c);
        // Must NOT degrade to `%EF%BF%BD` (U+FFFD).
        assert_eq!(out, "/p?q=%FF%FE");
    }

    // ── adversarial / robustness ──────────────────────────────

    #[test]
    fn does_not_panic_on_invalid_percent_escape() {
        let c = UrlMutateConfig::default();
        // %ZZ is invalid — must be treated as literal `%ZZ`, so the
        // `%` itself gets encoded.
        let (out, _) = mutate_url("/p?q=%ZZbad", &c);
        assert_eq!(out, "/p?q=%25ZZbad");
    }

    #[test]
    fn truncated_percent_escape_is_literal() {
        let c = UrlMutateConfig::default();
        let (out, _) = mutate_url("/p?q=ab%2", &c);
        assert_eq!(out, "/p?q=ab%252");
    }

    #[test]
    fn does_not_panic_on_empty_input() {
        let c = UrlMutateConfig::default();
        let (out, t) = mutate_url("", &c);
        assert_eq!(out, "");
        assert!(t.is_empty());
    }

    #[test]
    fn does_not_panic_on_trailing_question_mark() {
        let c = UrlMutateConfig::default();
        let (out, _) = mutate_url("/p?", &c);
        assert_eq!(out, "/p?");
    }

    #[test]
    fn handles_extremely_long_value() {
        let c = UrlMutateConfig::default();
        let long = "A".repeat(50_000);
        let input = format!("/p?q={long}");
        let (out, _) = mutate_url(&input, &c);
        // Alphanumeric → unchanged (50K A's)
        assert_eq!(out, input, "alnum long string must pass through");
    }

    #[test]
    fn multiple_pairs_each_get_mutated_independently() {
        let c = cfg(UrlStrategy::PercentEncodeAggressive, false);
        let (out, _) = mutate_url("/p?a=1'&b=2\"&c=3", &c);
        assert_eq!(out, "/p?a=1%27&b=2%22&c=3");
    }

    #[test]
    fn query_value_containing_equals_preserves_extra_equals() {
        let c = UrlMutateConfig::default();
        // `?key=base64==` is common (b64 padding). The first `=` is
        // the separator; "b64==" is the value, so the padding is
        // encoded as part of it rather than dropped.
        let (out, _) = mutate_url("/p?key=b64==", &c);
        assert_eq!(out, "/p?key=b64%3D%3D");
    }
}