Skip to main content

wafrift_encoding/encoding/
invisible.rs

1//! Invisible-character & tag-character encoders.
2//!
3//! A class of encodings the rest of `unicode.rs` doesn't cover. They share
4//! one trait: the rendered or normalized string LOOKS exactly like the
5//! original to a human or to a downstream tokenizer, but the byte stream a
6//! WAF inspects bears no resemblance to the keywords it has rules for.
7//!
//! - **Tag characters (U+E0000–U+E007F, "Plane 14 tags").** Each ASCII
//!   codepoint `c` has a tag-equivalent at `U+E0000 + c`. Subtract
//!   `0xE0000` from each tag codepoint and you recover the original
//!   ASCII. Prompt-injection research has shown modern LLM tokenizers
//!   preserve and decode these — meaning an LLM-backed WAF will see a
//!   benign-looking blob while still receiving the attack tokens.
14//! - **Variation selectors (U+FE00–U+FE0F, U+E0100–U+E01EF).** Originally
15//!   for emoji presentation. Some normalizers strip them; some preserve
16//!   them. A WAF that strips has to choose to strip every codepoint in
17//!   two non-contiguous ranges, which most don't.
//! - **Stylistic ligatures (U+FB00–U+FB06).** `ff`/`fi`/`fl`/`ffi`/`ffl`/
//!   `ſt`/`st`. NFKC decomposes them; non-NFKC tokenizers see them as
//!   single codepoints not in any keyword. Defeats keyword filters that
//!   inspect the raw stream before (or without) NFKC normalization.
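//! - **Enclosed alphanumerics (U+24B6–U+24E9 circled; U+1F110–U+1F129
//!   and U+249C–U+24B5 parenthesized).** Circled letters
//!   compatibility-decompose to plain Latin under NFKC. Parenthesized
//!   letters decompose to the letter wrapped in ASCII parentheses
//!   (`⒜` → `(a)`), so the keyword reappears with parentheses around
//!   every letter. Backends that NFKC see the letters; WAFs that don't,
//!   don't.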
25//! - **Soft hyphen / format chars (U+00AD, U+200B–U+200D, U+2060,
26//!   U+FEFF).** Some of these already live in
27//!   `unicode::zero_width_inject` for selective injection. This
28//!   module exposes them as a Strategy-compatible whole-string encoder
29//!   too, for cases where the engine wants to swap encoders rather than
30//!   compose them.
31//!
32//! All encoders here preserve UTF-8 validity and are byte-deterministic
33//! given the same input. None of them require entropy.
34//!
35//! # Why a new module
36//!
37//! `unicode.rs` is already 17K LOC of encoders. The encoders here belong
38//! together as a class — "looks identical, parses identical, byte stream
39//! is unrecognizable" — and putting them next to the case-folding /
40//! homoglyph / math-alphabet encoders would dilute that boundary.
41
/// Shift every ASCII codepoint into the Unicode Tags block (Plane 14).
///
/// Each input codepoint `c` in `0..=0x7F` is emitted as `U+E0000 + c`;
/// every other codepoint is copied through untouched. Note that only
/// `U+E0001` and `U+E0020..=U+E007F` are assigned tag characters, so ASCII
/// control characters land on unassigned (but still valid) codepoints.
///
/// Reversible: subtract `0xE0000` from every codepoint in
/// `U+E0000..=U+E007F` and the original ASCII falls out.
#[must_use]
pub fn tag_char_encode(input: &str) -> String {
    const TAG_BASE: u32 = 0xE0000;

    // Every ASCII byte grows to a 4-byte sequence, so 4× is an upper bound.
    let mut encoded = String::with_capacity(input.len() * 4);
    encoded.extend(input.chars().map(|ch| {
        if ch.is_ascii() {
            // U+E0000..=U+E007F contains no surrogates, so the conversion
            // always succeeds; the fallback only avoids an `unwrap`.
            char::from_u32(TAG_BASE + u32::from(ch)).unwrap_or(ch)
        } else {
            ch
        }
    }));
    encoded
}
66
/// Append a variation selector after every codepoint of `input`.
///
/// Renders identically; bytes are not.
///
/// `selector` must lie in `U+FE00..=U+FE0F` or `U+E0100..=U+E01EF`. Any
/// other value is silently coerced to `U+FE0F` — there is no default
/// argument; callers always pass a selector.
///
/// Returns an empty string for empty input.
#[must_use]
pub fn variation_selector_pad(input: &str, selector: char) -> String {
    let sel = match selector {
        '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}' => selector,
        _ => '\u{FE0F}',
    };
    // Exact output size: the original bytes plus one selector per codepoint.
    let char_count = input.chars().count();
    let mut out = String::with_capacity(input.len() + char_count * sel.len_utf8());
    for c in input.chars() {
        out.push(c);
        out.push(sel);
    }
    out
}
85
/// Follow every codepoint with a supplementary-plane variation selector
/// that depends on the codepoint's position.
///
/// The codepoint at index `i` is followed by `U+E0100 + (i % 240)`, so the
/// selectors walk through `U+E0100..=U+E01EF` and start over after 240
/// positions. The output is fully determined by the input.
///
/// Useful when a WAF strips a constant pad (U+FE0F) but allows the
/// supplementary plane — exposing the discrepancy directly.
#[must_use]
pub fn variation_selector_supplementary_pad(input: &str) -> String {
    // Endless U+E0100, U+E0101, …, U+E01EF, U+E0100, … sequence.
    let selectors = ('\u{E0100}'..='\u{E01EF}').cycle();

    let mut padded = String::with_capacity(input.len() * 5);
    for (ch, selector) in input.chars().zip(selectors) {
        padded.push(ch);
        padded.push(selector);
    }
    padded
}
103
/// Swap lowercase letter sequences for their precomposed stylistic
/// ligature codepoints (U+FB00..=U+FB06).
///
/// Scans left to right; at each position the first matching entry of the
/// table wins, otherwise one codepoint is copied through unchanged.
/// Matching is case-sensitive: only the lowercase spellings listed in the
/// table are replaced.
///
/// Defeats keyword filters that inspect the stream without NFKC-folding
/// it. NFKC normalization maps the ligatures back to plain letters, so
/// the origin parses identically.
#[must_use]
pub fn ligature_encode(input: &str) -> String {
    // Three-letter sequences come first so `ffi` / `ffl` are never split
    // into `ff` plus a leftover letter.
    const TABLE: [(&str, char); 7] = [
        ("ffi", '\u{FB03}'),
        ("ffl", '\u{FB04}'),
        ("ff", '\u{FB00}'),
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("st", '\u{FB06}'),
        ("ſt", '\u{FB05}'),
    ];

    let mut encoded = String::with_capacity(input.len());
    let mut remaining = input;
    while let Some(head) = remaining.chars().next() {
        let hit = TABLE.iter().find_map(|&(sequence, ligature)| {
            remaining
                .strip_prefix(sequence)
                .map(|tail| (ligature, tail))
        });
        match hit {
            Some((ligature, tail)) => {
                encoded.push(ligature);
                remaining = tail;
            }
            None => {
                // Nothing matches here: emit the codepoint and step past it.
                encoded.push(head);
                remaining = &remaining[head.len_utf8()..];
            }
        }
    }
    encoded
}
142
/// Map every ASCII letter onto its circled counterpart: `A..=Z` become
/// U+24B6..=U+24CF and `a..=z` become U+24D0..=U+24E9. All other
/// codepoints are copied through unchanged.
///
/// NFKC decomposes these back to the plain Latin letters. Same trick
/// shape as `fullwidth_encode` but a non-overlapping codepoint set —
/// rotating between them defeats any filter that scrubs ONE of them.
#[must_use]
pub fn circled_letter_encode(input: &str) -> String {
    const CIRCLED_UPPER_A: u32 = 0x24B6;
    const CIRCLED_LOWER_A: u32 = 0x24D0;

    let mut encoded = String::with_capacity(input.len() * 4);
    for ch in input.chars() {
        let target = if ch.is_ascii_uppercase() {
            Some(CIRCLED_UPPER_A + (u32::from(ch) - u32::from('A')))
        } else if ch.is_ascii_lowercase() {
            Some(CIRCLED_LOWER_A + (u32::from(ch) - u32::from('a')))
        } else {
            None
        };
        // Both target ranges are valid scalars; the fallback keeps the
        // original codepoint instead of unwrapping.
        encoded.push(target.and_then(char::from_u32).unwrap_or(ch));
    }
    encoded
}
174
/// Map every ASCII letter onto its parenthesized counterpart: `A..=Z`
/// become U+1F110..=U+1F129 and `a..=z` become U+249C..=U+24B5. All other
/// codepoints are copied through unchanged.
///
/// Another rotation partner for `circled_letter_encode` /
/// `fullwidth_encode` — the byte stream looks entirely different.
///
/// NOTE: unlike the circled and fullwidth forms, the compatibility
/// decomposition of these codepoints is the letter wrapped in ASCII
/// parentheses (`⒜` → `(a)`), so NFKC yields `(S)(E)(L)…` rather than
/// the bare keyword.
#[must_use]
pub fn parenthesized_letter_encode(input: &str) -> String {
    let mut encoded = String::with_capacity(input.len() * 4);
    encoded.extend(input.chars().map(|ch| {
        // Offset that moves the letter's own codepoint into the target run.
        let shift = match ch {
            'A'..='Z' => 0x1F110 - u32::from('A'),
            'a'..='z' => 0x249C - u32::from('a'),
            _ => return ch,
        };
        char::from_u32(u32::from(ch) + shift).unwrap_or(ch)
    }));
    encoded
}
207
/// Place a U+00AD SOFT HYPHEN between every two adjacent codepoints.
///
/// Nothing is added before the first or after the last codepoint, so empty
/// and single-codepoint inputs come back unchanged.
///
/// Visually invisible; many WAFs don't strip it because U+00AD is a
/// valid Latin-1 character. Backends that don't fold it see a string
/// that's no longer the keyword.
#[must_use]
pub fn soft_hyphen_inject(input: &str) -> String {
    const SOFT_HYPHEN: char = '\u{00AD}';

    let mut rest = input.chars();
    let head = match rest.next() {
        Some(ch) => ch,
        None => return String::new(),
    };

    // One counting pass sizes the buffer exactly: the input bytes plus a
    // 2-byte soft hyphen for each of the N-1 gaps.
    let gaps = input.chars().count() - 1;
    let mut out = String::with_capacity(input.len() + gaps * 2);

    out.push(head);
    for ch in rest {
        out.push(SOFT_HYPHEN);
        out.push(ch);
    }
    out
}
238
/// Surround every codepoint with U+2060 WORD JOINER.
///
/// A joiner is emitted before the first codepoint, between each pair, and
/// after the last one, so an input of `N` codepoints yields `N + 1`
/// joiners. Empty input therefore returns a single U+2060.
///
/// U+2060 is zero-width and has no canonical or compatibility
/// decomposition, so NFC and NFKC both leave it in place. It is a
/// default-ignorable codepoint, so it disappears only in pipelines that
/// strip those (for example NFKC_Casefold).
#[must_use]
pub fn word_joiner_wrap(input: &str) -> String {
    const WORD_JOINER: char = '\u{2060}';

    // Exact output size: input bytes plus N + 1 joiners of 3 bytes each.
    // (Reserving `len * 4` forgets the trailing joiner for ASCII input.)
    let joiner_count = input.chars().count() + 1;
    let mut out = String::with_capacity(input.len() + joiner_count * WORD_JOINER.len_utf8());

    out.push(WORD_JOINER);
    for c in input.chars() {
        out.push(c);
        out.push(WORD_JOINER);
    }
    out
}
254
/// Names of the invisible-class encoders shipped by this module, one entry
/// per public encoder function, in declaration order.
///
/// Used by the integration test to assert that the dispatcher in
/// `strategy.rs` has wired every one of them.
pub const INVISIBLE_ENCODER_NAMES: &[&str] = &[
    // Plane-14 tag characters.
    "tag_char_encode",
    // Variation-selector padding.
    "variation_selector_pad",
    "variation_selector_supplementary_pad",
    // Compatibility look-alikes.
    "ligature_encode",
    "circled_letter_encode",
    "parenthesized_letter_encode",
    // Invisible format characters.
    "soft_hyphen_inject",
    "word_joiner_wrap",
];
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Whole-string encoder shape shared by every function under test.
    type Encoder = fn(&str) -> String;

    /// `variation_selector_pad` pinned to U+FE0F so it fits [`Encoder`].
    fn variation_selector_pad_fe0f(input: &str) -> String {
        variation_selector_pad(input, '\u{FE0F}')
    }

    /// Every encoder in the module, in `INVISIBLE_ENCODER_NAMES` order.
    /// Kept in one place so no cross-cutting test can forget an encoder.
    static ALL_ENCODERS: [Encoder; 8] = [
        tag_char_encode,
        variation_selector_pad_fe0f,
        variation_selector_supplementary_pad,
        ligature_encode,
        circled_letter_encode,
        parenthesized_letter_encode,
        soft_hyphen_inject,
        word_joiner_wrap,
    ];

    /// Pairs each encoder with its published name for assertion messages.
    fn named_encoders() -> impl Iterator<Item = (&'static str, Encoder)> {
        INVISIBLE_ENCODER_NAMES
            .iter()
            .copied()
            .zip(ALL_ENCODERS.iter().copied())
    }

    #[test]
    fn tag_char_round_trips_via_codepoint_subtraction() {
        let encoded = tag_char_encode("SELECT");
        let recovered: String = encoded
            .chars()
            .map(|c| {
                let cp = c as u32;
                if (0xE0000..=0xE007F).contains(&cp) {
                    char::from_u32(cp - 0xE0000).unwrap_or(c)
                } else {
                    c
                }
            })
            .collect();
        assert_eq!(recovered, "SELECT");
    }

    #[test]
    fn tag_char_preserves_non_ascii() {
        let encoded = tag_char_encode("SELECT' OR Ä");
        assert!(
            encoded.contains('Ä'),
            "non-ASCII passes through: {encoded:?}"
        );
    }

    #[test]
    fn tag_char_every_byte_changes() {
        let raw = "SELECT";
        let encoded = tag_char_encode(raw);
        assert_ne!(raw, encoded);
        // Every encoded codepoint must be in plane 14, not ASCII.
        for c in encoded.chars() {
            let cp = c as u32;
            assert!((0xE0000..=0xE007F).contains(&cp), "non-tag codepoint: {c}");
        }
    }

    #[test]
    fn tag_char_handles_empty() {
        assert_eq!(tag_char_encode(""), "");
    }

    #[test]
    fn variation_selector_default_is_fe0f() {
        let out = variation_selector_pad("AB", '\u{FE0F}');
        assert_eq!(out, "A\u{FE0F}B\u{FE0F}");
    }

    #[test]
    fn variation_selector_invalid_falls_back_to_fe0f() {
        let out = variation_selector_pad("X", 'a');
        assert_eq!(out, "X\u{FE0F}", "fallback selector: {out:?}");
    }

    #[test]
    fn variation_selector_accepts_supplementary_range() {
        let out = variation_selector_pad("X", '\u{E0100}');
        assert_eq!(out, "X\u{E0100}");
    }

    #[test]
    fn variation_selector_supplementary_varies_per_position() {
        let out = variation_selector_supplementary_pad("AB");
        let selectors: Vec<char> = out
            .chars()
            .filter(|c| (0xE0100..=0xE01EF).contains(&(*c as u32)))
            .collect();
        assert_eq!(selectors.len(), 2);
        assert_ne!(
            selectors[0], selectors[1],
            "selectors must differ per position"
        );
    }

    #[test]
    fn variation_selector_supplementary_wraps_after_240() {
        // 240 selectors exist in U+E0100..=U+E01EF; position 240 reuses the first.
        let out = variation_selector_supplementary_pad(&"x".repeat(241));
        let selectors: Vec<char> = out.chars().filter(|c| *c != 'x').collect();
        assert_eq!(selectors.len(), 241);
        assert_eq!(selectors[0], '\u{E0100}');
        assert_eq!(selectors[239], '\u{E01EF}');
        assert_eq!(selectors[240], '\u{E0100}');
    }

    #[test]
    fn ligature_encode_replaces_known_digraphs() {
        // "effect"   → e·ff·ect  — `ff` not followed by `i`/`l`, so U+FB00.
        // "official" → o·ffi·cial — `ffi` matches before `ff`, so U+FB03.
        // "offload"  → o·ffl·oad  — `ffl` matches before `ff`, so U+FB04.
        let out = ligature_encode("effect official offload");
        assert!(out.contains('\u{FB00}'), "ff → U+FB00 in 'effect': {out:?}");
        assert!(
            out.contains('\u{FB03}'),
            "ffi → U+FB03 in 'official': {out:?}"
        );
        assert!(
            out.contains('\u{FB04}'),
            "ffl → U+FB04 in 'offload': {out:?}"
        );
    }

    #[test]
    fn ligature_encode_covers_remaining_table_entries() {
        // fi → U+FB01, st → U+FB06, fl → U+FB02, ſt → U+FB05.
        let out = ligature_encode("first flat ſt");
        assert_eq!(out, "\u{FB01}r\u{FB06} \u{FB02}at \u{FB05}");
    }

    #[test]
    fn ligature_encode_prefers_longest_match() {
        // `ffi` must be matched as one ligature, not `ff` + `i`.
        let out = ligature_encode("ffi");
        assert_eq!(out, "\u{FB03}");
        assert!(!out.contains('\u{FB00}'));
    }

    #[test]
    fn ligature_encode_passes_unmatched_chars() {
        let out = ligature_encode("axyz");
        assert_eq!(out, "axyz");
    }

    #[test]
    fn ligature_encode_handles_empty() {
        assert_eq!(ligature_encode(""), "");
    }

    #[test]
    fn circled_letter_uppercase_and_lowercase() {
        let out = circled_letter_encode("Aa");
        assert!(out.contains('\u{24B6}'), "A → Ⓐ: {out:?}");
        assert!(out.contains('\u{24D0}'), "a → ⓐ: {out:?}");
    }

    #[test]
    fn circled_letter_preserves_punctuation() {
        let out = circled_letter_encode("A'B");
        assert!(out.contains('\''), "quote preserved: {out:?}");
    }

    #[test]
    fn parenthesized_letter_uppercase_and_lowercase() {
        let out = parenthesized_letter_encode("Bb");
        assert!(out.contains('\u{1F111}'), "B → 🄑: {out:?}");
        assert!(out.contains('\u{249D}'), "b → ⒝: {out:?}");
    }

    #[test]
    fn circled_and_parenthesized_produce_different_bytes() {
        let raw = "SELECT";
        let circled = circled_letter_encode(raw);
        let parens = parenthesized_letter_encode(raw);
        assert_ne!(
            circled, parens,
            "rotation partners must produce distinct byte streams"
        );
    }

    #[test]
    fn soft_hyphen_inject_between_each_pair() {
        let out = soft_hyphen_inject("ABC");
        // Expect: A, U+00AD, B, U+00AD, C
        let count = out.chars().filter(|&c| c == '\u{00AD}').count();
        assert_eq!(count, 2, "soft hyphen between each pair: {out:?}");
    }

    #[test]
    fn soft_hyphen_inject_empty_is_empty() {
        assert_eq!(soft_hyphen_inject(""), "");
    }

    #[test]
    fn soft_hyphen_inject_single_char_unchanged() {
        assert_eq!(soft_hyphen_inject("A"), "A");
    }

    #[test]
    fn word_joiner_wraps_both_ends() {
        let out = word_joiner_wrap("AB");
        let count = out.chars().filter(|&c| c == '\u{2060}').count();
        // Before A, between A-B, after B.
        assert_eq!(count, 3, "wrap with joiner at each boundary: {out:?}");
    }

    #[test]
    fn all_encoders_preserve_utf8_validity() {
        let payload = "' OR 1=1 -- SELECT * FROM users";
        for (name, enc) in named_encoders() {
            let out = enc(payload);
            // `String` already guarantees valid UTF-8; what can still go
            // wrong is an encoder dropping the payload entirely.
            assert!(
                !out.is_empty(),
                "{name} produced empty output on non-empty input"
            );
        }
    }

    #[test]
    fn all_encoders_are_deterministic() {
        let payload = "SELECT' OR 1=1";
        for (name, enc) in named_encoders() {
            assert_eq!(enc(payload), enc(payload), "{name} must be deterministic");
        }
    }

    #[test]
    fn all_encoders_handle_empty_input() {
        for (name, enc) in named_encoders() {
            // `word_joiner_wrap("")` still emits one U+2060 — that keeps its
            // "joiner at every boundary" invariant. Everything else is empty.
            let expected = if name == "word_joiner_wrap" {
                "\u{2060}"
            } else {
                ""
            };
            assert_eq!(enc(""), expected, "{name} on empty input");
        }
    }

    #[test]
    fn invisible_encoder_names_match_pub_fns() {
        // Smoke check only: the published list has one well-formed, unique
        // name per encoder in `ALL_ENCODERS`. It cannot detect a new `pub fn`
        // that was added to the module but to neither list.
        assert_eq!(INVISIBLE_ENCODER_NAMES.len(), 8);
        assert_eq!(INVISIBLE_ENCODER_NAMES.len(), ALL_ENCODERS.len());
        for (i, name) in INVISIBLE_ENCODER_NAMES.iter().enumerate() {
            assert!(!name.is_empty());
            assert!(
                name.chars().all(|c| c.is_ascii_lowercase() || c == '_'),
                "encoder names must be snake_case: {name}"
            );
            assert!(
                !INVISIBLE_ENCODER_NAMES[..i].contains(name),
                "duplicate encoder name: {name}"
            );
        }
    }

    #[test]
    fn adversarial_large_input_does_not_panic() {
        let big = "A".repeat(10_000);
        for (_, enc) in named_encoders() {
            let _ = enc(&big);
        }
    }

    #[test]
    fn unicode_input_round_trip_safe() {
        let payload = "Ä' OR ñ=1 -- 日本";
        for (name, enc) in named_encoders() {
            let out = enc(payload);
            // Encoders only touch ASCII or known digraphs, so every
            // non-ASCII payload char must survive — not just one of them.
            for kept in ['Ä', 'ñ', '日', '本'] {
                assert!(out.contains(kept), "{name} dropped {kept:?}: {out:?}");
            }
        }
    }
}