Skip to main content

llmtrace_security/
normalise.rs

1//! Unicode normalisation layer for security analysis.
2//!
3//! This module provides text normalisation as a preprocessing step before all
4//! security analysis.  It applies a multi-stage pipeline to defeat Unicode-based
5//! evasion techniques:
6//!
7//! 1. **NFKC normalisation** — compatibility decomposition + canonical composition
8//! 2. **Diacritics stripping** — removes combining marks to defeat accent evasion
9//!    (IS-031)
10//! 3. **Invisible character stripping** — removes zero-width, tag, and control
11//!    characters (IS-022)
12//! 4. **Homoglyph mapping** — maps Cyrillic, Greek, upside-down, and Braille
13//!    characters to ASCII equivalents (IS-021, IS-015)
14//! 5. **Emoji stripping** — removes emoji to defeat emoji-smuggling attacks
15//!    (IS-020)
16//!
17//! # Why?
18//!
19//! Attackers can bypass regex-based detection by using visually identical but
20//! distinct Unicode code points — for example, Cyrillic `а` (U+0430) instead
21//! of Latin `a` (U+0061), embedding zero-width characters inside keywords,
22//! using upside-down letters, encoding text in Braille, adding diacritics, or
23//! interspersing emoji characters.  Normalising text before analysis neutralises
24//! these evasion techniques.
25
26use unicode_normalization::UnicodeNormalization;
27
28/// Characters that are zero-width or invisible and should be stripped.
29const ZERO_WIDTH_CHARS: &[char] = &[
30    '\u{200B}', // Zero-width space
31    '\u{200C}', // Zero-width non-joiner
32    '\u{200D}', // Zero-width joiner
33    '\u{FEFF}', // BOM / zero-width no-break space
34    '\u{00AD}', // Soft hyphen
35    '\u{2060}', // Word joiner
36    '\u{2028}', // Line separator
37    '\u{2029}', // Paragraph separator
38    // Bidirectional control characters (U+202A-U+202E)
39    '\u{202A}', // Left-to-right embedding
40    '\u{202B}', // Right-to-left embedding
41    '\u{202C}', // Pop directional formatting
42    '\u{202D}', // Left-to-right override
43    '\u{202E}', // Right-to-left override
44    // Bidirectional isolate characters (U+2066-U+2069)
45    '\u{2066}', // Left-to-right isolate
46    '\u{2067}', // Right-to-left isolate
47    '\u{2068}', // First strong isolate
48    '\u{2069}', // Pop directional isolate
49];
50
51/// Normalise text for security analysis.
52///
53/// This function applies a multi-stage normalisation pipeline:
54/// 1. NFKC normalisation (compatibility decomposition + canonical composition)
55/// 2. Diacritics stripping via NFD decomposition and combining mark removal
56/// 3. Zero-width, invisible, and Unicode tag character stripping
57/// 4. Homoglyph mapping (Cyrillic, Greek, upside-down text, Braille → ASCII)
58/// 5. Emoji stripping
59///
60/// # Examples
61///
62/// ```
63/// use llmtrace_security::normalise::normalise_text;
64///
65/// // NFKC normalisation: fullwidth "A" → "A"
66/// assert_eq!(normalise_text("\u{FF21}"), "A");
67///
68/// // Zero-width stripping
69/// assert_eq!(normalise_text("he\u{200B}llo"), "hello");
70///
71/// // Homoglyph mapping: Cyrillic "а" → Latin "a"
72/// assert_eq!(normalise_text("\u{0430}"), "a");
73///
74/// // Diacritics stripping: "café" → "cafe"
75/// assert_eq!(normalise_text("caf\u{00E9}"), "cafe");
76///
77/// // Emoji stripping: "he😀llo" → "hello"
78/// assert_eq!(normalise_text("he\u{1F600}llo"), "hello");
79/// ```
80pub fn normalise_text(input: &str) -> String {
81    // Step 1: NFKC normalisation
82    let nfkc: String = input.nfkc().collect();
83
84    // Step 2: Strip diacritics (NFD decomposition + combining mark removal)
85    let without_diacritics = strip_diacritics(&nfkc);
86
87    // Step 3: Strip zero-width, invisible, and tag characters
88    let stripped: String = without_diacritics
89        .chars()
90        .filter(|c| !ZERO_WIDTH_CHARS.contains(c) && !is_tag_character(*c))
91        .collect();
92
93    // Step 4: Map homoglyphs to ASCII equivalents
94    let mapped: String = stripped.chars().map(map_homoglyph).collect();
95
96    // Step 5: Strip emoji characters
97    strip_emoji(&mapped)
98}
99
100/// Strip emoji characters from text.
101///
102/// Removes characters in standard Unicode emoji ranges including emoticons,
103/// pictographs, transport symbols, dingbats, variation selectors, and skin
104/// tone modifiers.  Emoji are removed entirely (not replaced with spaces) to
105/// prevent attackers from using them as word separators to bypass detection.
106///
107/// # Examples
108///
109/// ```
110/// use llmtrace_security::normalise::strip_emoji;
111///
112/// assert_eq!(strip_emoji("hello 🌍 world"), "hello  world");
113/// assert_eq!(strip_emoji("ig🔥no📌re"), "ignore");
114/// ```
115pub fn strip_emoji(input: &str) -> String {
116    input.chars().filter(|c| !is_emoji(*c)).collect()
117}
118
119/// Strip diacritics (combining marks) from text.
120///
121/// Applies NFD (canonical decomposition) to separate base characters from
122/// combining marks, then removes all combining marks.  This converts accented
123/// characters to their base forms (e.g., "é" → "e", "ñ" → "n").
124///
125/// # Examples
126///
127/// ```
128/// use llmtrace_security::normalise::strip_diacritics;
129///
130/// assert_eq!(strip_diacritics("café"), "cafe");
131/// assert_eq!(strip_diacritics("résumé"), "resume");
132/// assert_eq!(strip_diacritics("naïve"), "naive");
133/// ```
134pub fn strip_diacritics(input: &str) -> String {
135    input.nfd().filter(|c| !is_combining_mark(*c)).collect()
136}
137
138/// Returns `true` if the character is an emoji.
139///
140/// Covers standard Unicode emoji ranges: emoticons, miscellaneous symbols,
141/// transport/map symbols, alchemical symbols, geometric shapes extended,
142/// supplemental arrows, dingbats, variation selectors, and skin tone modifiers.
143fn is_emoji(c: char) -> bool {
144    let cp = c as u32;
145    matches!(
146        cp,
147        0x1F600..=0x1F64F   // Emoticons
148        | 0x1F300..=0x1F5FF // Misc Symbols and Pictographs
149        | 0x1F680..=0x1F6FF // Transport and Map Symbols
150        | 0x1F700..=0x1F77F // Alchemical Symbols
151        | 0x1F780..=0x1F7FF // Geometric Shapes Extended
152        | 0x1F800..=0x1F8FF // Supplemental Arrows-C
153        | 0x1F900..=0x1F9FF // Supplemental Symbols and Pictographs
154        | 0x1FA00..=0x1FA6F // Chess Symbols
155        | 0x1FA70..=0x1FAFF // Symbols and Pictographs Extended-A
156        | 0x2600..=0x26FF   // Miscellaneous Symbols
157        | 0x2700..=0x27BF   // Dingbats
158        | 0xFE00..=0xFE0F // Variation Selectors
159                          // Skin tone modifiers (U+1F3FB–U+1F3FF) are covered by
160                          // Misc Symbols and Pictographs (U+1F300–U+1F5FF) above.
161    )
162}
163
164/// Returns `true` if the character is a Unicode combining mark.
165///
166/// Covers the principal combining diacritical mark blocks used to add accents
167/// and other modifications to base characters.
168fn is_combining_mark(c: char) -> bool {
169    let cp = c as u32;
170    matches!(
171        cp,
172        0x0300..=0x036F   // Combining Diacritical Marks
173        | 0x0483..=0x0489 // Combining Cyrillic
174        | 0x1AB0..=0x1AFF // Combining Diacritical Marks Extended
175        | 0x1DC0..=0x1DFF // Combining Diacritical Marks Supplement
176        | 0x20D0..=0x20FF // Combining Diacritical Marks for Symbols
177        | 0xFE20..=0xFE2F // Combining Half Marks
178    )
179}
180
181/// Returns `true` if the character is a Unicode tag character.
182///
183/// Tag characters (U+E0001–U+E007F) duplicate ASCII but are invisible.  They
184/// were designed for language tagging but can be exploited to smuggle hidden
185/// text through LLM pipelines.
186fn is_tag_character(c: char) -> bool {
187    let cp = c as u32;
188    (0xE0001..=0xE007F).contains(&cp)
189}
190
191/// Map a single character to its ASCII equivalent if it is a known homoglyph.
192///
193/// Covers the most common Cyrillic-to-Latin confusables, Greek confusables,
194/// upside-down (flipped) Latin letters, and Braille Grade 1 letter patterns.
195fn map_homoglyph(c: char) -> char {
196    match c {
197        // =================================================================
198        // Cyrillic → Latin (lowercase)
199        // =================================================================
200        '\u{0430}' => 'a', // Cyrillic а
201        '\u{0435}' => 'e', // Cyrillic е
202        '\u{043E}' => 'o', // Cyrillic о
203        '\u{0440}' => 'p', // Cyrillic р
204        '\u{0441}' => 'c', // Cyrillic с
205        '\u{0445}' => 'x', // Cyrillic х
206        '\u{0443}' => 'y', // Cyrillic у
207        '\u{0456}' => 'i', // Cyrillic і (Ukrainian i)
208        '\u{0458}' => 'j', // Cyrillic ј
209        '\u{04BB}' => 'h', // Cyrillic һ
210
211        // =================================================================
212        // Cyrillic → Latin (uppercase)
213        // =================================================================
214        '\u{0410}' => 'A', // Cyrillic А
215        '\u{0412}' => 'B', // Cyrillic В
216        '\u{0415}' => 'E', // Cyrillic Е
217        '\u{041A}' => 'K', // Cyrillic К
218        '\u{041C}' => 'M', // Cyrillic М
219        '\u{041D}' => 'H', // Cyrillic Н
220        '\u{041E}' => 'O', // Cyrillic О
221        '\u{0420}' => 'P', // Cyrillic Р
222        '\u{0421}' => 'C', // Cyrillic С
223        '\u{0422}' => 'T', // Cyrillic Т
224        '\u{0425}' => 'X', // Cyrillic Х
225
226        // =================================================================
227        // Greek → Latin
228        // =================================================================
229        '\u{03BF}' => 'o', // Greek omicron ο
230        '\u{03B1}' => 'a', // Greek alpha α (after NFKC, still distinct)
231        '\u{0391}' => 'A', // Greek Alpha Α
232        '\u{0392}' => 'B', // Greek Beta Β
233        '\u{0395}' => 'E', // Greek Epsilon Ε
234        '\u{0396}' => 'Z', // Greek Zeta Ζ
235        '\u{0397}' => 'H', // Greek Eta Η
236        '\u{0399}' => 'I', // Greek Iota Ι
237        '\u{039A}' => 'K', // Greek Kappa Κ
238        '\u{039B}' => 'V', // Greek Lambda Λ (upside-down V)
239        '\u{039C}' => 'M', // Greek Mu Μ
240        '\u{039D}' => 'N', // Greek Nu Ν
241        '\u{039F}' => 'O', // Greek Omicron Ο
242        '\u{03A1}' => 'P', // Greek Rho Ρ
243        '\u{03A4}' => 'T', // Greek Tau Τ
244        '\u{03A5}' => 'Y', // Greek Upsilon Υ
245        '\u{03A7}' => 'X', // Greek Chi Χ
246
247        // =================================================================
248        // Upside-down / flipped Latin (lowercase)  — IS-021
249        // =================================================================
250        '\u{0250}' => 'a', // ɐ  (turned a)
251        '\u{0254}' => 'c', // ɔ  (open o / turned c)
252        '\u{01DD}' => 'e', // ǝ  (turned e)
253        '\u{025F}' => 'f', // ɟ  (dotless j with stroke / turned f)
254        '\u{0183}' => 'g', // ƃ  (b with topbar / turned g)
255        '\u{0265}' => 'h', // ɥ  (turned h)
256        '\u{0131}' => 'i', // ı  (dotless i)
257        '\u{027E}' => 'j', // ɾ  (r with fishhook / turned j)
258        '\u{029E}' => 'k', // ʞ  (turned k)
259        '\u{026F}' => 'm', // ɯ  (turned m)
260        '\u{0279}' => 'r', // ɹ  (turned r)
261        '\u{0287}' => 't', // ʇ  (turned t)
262        '\u{028C}' => 'v', // ʌ  (turned v / caret)
263        '\u{028D}' => 'w', // ʍ  (turned w)
264        '\u{028E}' => 'y', // ʎ  (turned y)
265
266        // =================================================================
267        // Upside-down / flipped Latin (uppercase)  — IS-021
268        //
269        // NOTE: Characters handled by NFKC are omitted to avoid dead arms:
270        //   Ⅎ (U+2132) → F,  ⅁ (U+2141) → G,  ⅄ (U+2144) → Y
271        //   ſ (U+017F) → s   (NFKC; task specifies J but NFKC wins)
272        // =================================================================
273        '\u{2200}' => 'A', // ∀  (for-all / turned A)
274        '\u{15FA}' => 'B', // ᗺ  (Canadian Syllabics Carrier SI / turned B)
275        '\u{0186}' => 'C', // Ɔ  (open O / turned C)
276        '\u{15E1}' => 'D', // ᗡ  (Canadian Syllabics Carrier THE / turned D)
277        '\u{018E}' => 'E', // Ǝ  (reversed E)
278        '\u{02E5}' => 'L', // ˥  (modifier letter extra-high tone bar / turned L)
279        '\u{0500}' => 'P', // Ԁ  (Cyrillic Komi De / turned P)
280        '\u{1D1A}' => 'R', // ᴚ  (Latin letter small capital turned R)
281        '\u{22A5}' => 'T', // ⊥  (up tack / turned T)
282        '\u{2229}' => 'U', // ∩  (intersection / turned U)
283
284        // =================================================================
285        // Braille Grade 1 → ASCII  — IS-015
286        //
287        // Standard Braille encoding where each dot pattern maps to a letter.
288        // U+2800 (blank) maps to space.
289        // =================================================================
290        '\u{2800}' => ' ', // ⠀  (blank)
291        '\u{2801}' => 'a', // ⠁  (dot 1)
292        '\u{2803}' => 'b', // ⠃  (dots 1-2)
293        '\u{2809}' => 'c', // ⠉  (dots 1-4)
294        '\u{2819}' => 'd', // ⠙  (dots 1-4-5)
295        '\u{2811}' => 'e', // ⠑  (dots 1-5)
296        '\u{280B}' => 'f', // ⠋  (dots 1-2-4)
297        '\u{281B}' => 'g', // ⠛  (dots 1-2-4-5)
298        '\u{2813}' => 'h', // ⠓  (dots 1-2-5)
299        '\u{280A}' => 'i', // ⠊  (dots 2-4)
300        '\u{281A}' => 'j', // ⠚  (dots 2-4-5)
301        '\u{2805}' => 'k', // ⠅  (dots 1-3)
302        '\u{2807}' => 'l', // ⠇  (dots 1-2-3)
303        '\u{280D}' => 'm', // ⠍  (dots 1-3-4)
304        '\u{281D}' => 'n', // ⠝  (dots 1-3-4-5)
305        '\u{2815}' => 'o', // ⠕  (dots 1-3-5)
306        '\u{280F}' => 'p', // ⠏  (dots 1-2-3-4)
307        '\u{281F}' => 'q', // ⠟  (dots 1-2-3-4-5)
308        '\u{2817}' => 'r', // ⠗  (dots 1-2-3-5)
309        '\u{280E}' => 's', // ⠎  (dots 2-3-4)
310        '\u{281E}' => 't', // ⠞  (dots 2-3-4-5)
311        '\u{2825}' => 'u', // ⠥  (dots 1-3-6)
312        '\u{2827}' => 'v', // ⠧  (dots 1-2-3-6)
313        '\u{283A}' => 'w', // ⠺  (dots 2-4-5-6)
314        '\u{282D}' => 'x', // ⠭  (dots 1-3-4-6)
315        '\u{283D}' => 'y', // ⠽  (dots 1-3-4-5-6)
316        '\u{2835}' => 'z', // ⠵  (dots 1-3-5-6)
317
318        _ => c,
319    }
320}
321
322// ===========================================================================
323// Tests
324// ===========================================================================
325
326#[cfg(test)]
327mod tests {
328    use super::*;
329
330    // -- NFKC normalisation ------------------------------------------------
331
332    #[test]
333    fn test_nfkc_fullwidth_to_ascii() {
334        // Fullwidth "HELLO" → "HELLO"
335        assert_eq!(
336            normalise_text("\u{FF28}\u{FF25}\u{FF2C}\u{FF2C}\u{FF2F}"),
337            "HELLO"
338        );
339    }
340
341    #[test]
342    fn test_nfkc_superscript_digits() {
343        // Superscript "²" → "2"
344        assert_eq!(normalise_text("\u{00B2}"), "2");
345    }
346
347    #[test]
348    fn test_nfkc_ligature_fi() {
349        // Ligature "fi" → "fi"
350        assert_eq!(normalise_text("\u{FB01}"), "fi");
351    }
352
353    #[test]
354    fn test_nfkc_roman_numeral() {
355        // Roman numeral Ⅳ (U+2163) → "IV"
356        assert_eq!(normalise_text("\u{2163}"), "IV");
357    }
358
359    #[test]
360    fn test_nfkc_preserves_normal_ascii() {
361        let text = "Hello, world! 123";
362        assert_eq!(normalise_text(text), text);
363    }
364
365    // -- Zero-width character stripping ------------------------------------
366
367    #[test]
368    fn test_strip_zero_width_space() {
369        assert_eq!(normalise_text("ig\u{200B}nore"), "ignore");
370    }
371
372    #[test]
373    fn test_strip_zero_width_non_joiner() {
374        assert_eq!(normalise_text("in\u{200C}structions"), "instructions");
375    }
376
377    #[test]
378    fn test_strip_zero_width_joiner() {
379        assert_eq!(normalise_text("pr\u{200D}ompt"), "prompt");
380    }
381
382    #[test]
383    fn test_strip_bom() {
384        assert_eq!(normalise_text("\u{FEFF}hello"), "hello");
385    }
386
387    #[test]
388    fn test_strip_soft_hyphen() {
389        assert_eq!(normalise_text("ig\u{00AD}nore"), "ignore");
390    }
391
392    #[test]
393    fn test_strip_word_joiner() {
394        assert_eq!(normalise_text("sys\u{2060}tem"), "system");
395    }
396
397    #[test]
398    fn test_strip_line_separator() {
399        assert_eq!(normalise_text("a\u{2028}b"), "ab");
400    }
401
402    #[test]
403    fn test_strip_paragraph_separator() {
404        assert_eq!(normalise_text("a\u{2029}b"), "ab");
405    }
406
407    #[test]
408    fn test_strip_bidi_controls() {
409        let input = "\u{202A}system\u{202C}: override\u{202E}";
410        assert_eq!(normalise_text(input), "system: override");
411    }
412
413    #[test]
414    fn test_strip_bidi_isolates() {
415        let input = "\u{2066}ignore\u{2069} previous";
416        assert_eq!(normalise_text(input), "ignore previous");
417    }
418
419    #[test]
420    fn test_strip_multiple_zero_width_in_keyword() {
421        // "i\u{200B}g\u{200C}n\u{200D}o\u{FEFF}re" → "ignore"
422        assert_eq!(
423            normalise_text("i\u{200B}g\u{200C}n\u{200D}o\u{FEFF}re"),
424            "ignore"
425        );
426    }
427
428    // -- Unicode tag character stripping (IS-022) --------------------------
429
430    #[test]
431    fn test_strip_tag_language_tag() {
432        // U+E0001 (LANGUAGE TAG) should be stripped
433        assert_eq!(normalise_text("hello\u{E0001}world"), "helloworld");
434    }
435
436    #[test]
437    fn test_strip_tag_characters_range() {
438        // Tag characters U+E0020–U+E007E embed invisible ASCII-equivalent text
439        let input = "safe\u{E0069}\u{E0067}\u{E006E}\u{E006F}\u{E0072}\u{E0065}text";
440        assert_eq!(normalise_text(input), "safetext");
441    }
442
443    #[test]
444    fn test_strip_tag_cancel_tag() {
445        // U+E007F (CANCEL TAG) should also be stripped
446        assert_eq!(normalise_text("a\u{E007F}b"), "ab");
447    }
448
449    #[test]
450    fn test_strip_all_tag_range() {
451        // Ensure the full tag range U+E0001–U+E007F is stripped
452        let mut input = String::from("start");
453        for cp in 0xE0001..=0xE007Fu32 {
454            if let Some(c) = char::from_u32(cp) {
455                input.push(c);
456            }
457        }
458        input.push_str("end");
459        assert_eq!(normalise_text(&input), "startend");
460    }
461
462    // -- Homoglyph mapping --------------------------------------------------
463
464    #[test]
465    fn test_cyrillic_a_to_latin_a() {
466        assert_eq!(normalise_text("\u{0430}"), "a");
467    }
468
469    #[test]
470    fn test_cyrillic_e_to_latin_e() {
471        assert_eq!(normalise_text("\u{0435}"), "e");
472    }
473
474    #[test]
475    fn test_cyrillic_o_to_latin_o() {
476        assert_eq!(normalise_text("\u{043E}"), "o");
477    }
478
479    #[test]
480    fn test_cyrillic_p_to_latin_p() {
481        assert_eq!(normalise_text("\u{0440}"), "p");
482    }
483
484    #[test]
485    fn test_cyrillic_c_to_latin_c() {
486        assert_eq!(normalise_text("\u{0441}"), "c");
487    }
488
489    #[test]
490    fn test_mixed_script_homoglyph_attack() {
491        // "ignоre" with Cyrillic о (U+043E) → "ignore" with Latin o
492        let malicious = "ign\u{043E}re";
493        assert_eq!(normalise_text(malicious), "ignore");
494    }
495
496    #[test]
497    fn test_full_cyrillic_word_looks_like_ignore() {
498        // Cyrillic: і + g + n + о + r + е
499        let malicious = "\u{0456}gnor\u{0435}";
500        assert_eq!(normalise_text(malicious), "ignore");
501    }
502
503    #[test]
504    fn test_cyrillic_uppercase_confusables() {
505        // Cyrillic А, С, Е, О, Р → Latin A, C, E, O, P
506        let text = "\u{0410}\u{0421}\u{0415}\u{041E}\u{0420}";
507        assert_eq!(normalise_text(text), "ACEOP");
508    }
509
510    #[test]
511    fn test_greek_omicron_to_latin_o() {
512        assert_eq!(normalise_text("\u{03BF}"), "o");
513    }
514
515    #[test]
516    fn test_greek_uppercase_confusables() {
517        // Greek Α, Β, Ε → Latin A, B, E
518        let text = "\u{0391}\u{0392}\u{0395}";
519        assert_eq!(normalise_text(text), "ABE");
520    }
521
522    // -- Upside-down text mapping (IS-021) ---------------------------------
523
524    #[test]
525    fn test_upside_down_individual_chars() {
526        assert_eq!(map_homoglyph('\u{0250}'), 'a'); // ɐ
527        assert_eq!(map_homoglyph('\u{0254}'), 'c'); // ɔ
528        assert_eq!(map_homoglyph('\u{01DD}'), 'e'); // ǝ
529        assert_eq!(map_homoglyph('\u{025F}'), 'f'); // ɟ
530        assert_eq!(map_homoglyph('\u{0183}'), 'g'); // ƃ
531        assert_eq!(map_homoglyph('\u{0265}'), 'h'); // ɥ
532        assert_eq!(map_homoglyph('\u{0131}'), 'i'); // ı
533        assert_eq!(map_homoglyph('\u{027E}'), 'j'); // ɾ
534        assert_eq!(map_homoglyph('\u{029E}'), 'k'); // ʞ
535        assert_eq!(map_homoglyph('\u{026F}'), 'm'); // ɯ
536        assert_eq!(map_homoglyph('\u{0279}'), 'r'); // ɹ
537        assert_eq!(map_homoglyph('\u{0287}'), 't'); // ʇ
538        assert_eq!(map_homoglyph('\u{028C}'), 'v'); // ʌ
539        assert_eq!(map_homoglyph('\u{028D}'), 'w'); // ʍ
540        assert_eq!(map_homoglyph('\u{028E}'), 'y'); // ʎ
541    }
542
543    #[test]
544    fn test_upside_down_uppercase_chars() {
545        assert_eq!(map_homoglyph('\u{2200}'), 'A'); // ∀
546        assert_eq!(map_homoglyph('\u{15FA}'), 'B'); // ᗺ
547        assert_eq!(map_homoglyph('\u{0186}'), 'C'); // Ɔ
548        assert_eq!(map_homoglyph('\u{15E1}'), 'D'); // ᗡ
549        assert_eq!(map_homoglyph('\u{018E}'), 'E'); // Ǝ
550        assert_eq!(map_homoglyph('\u{02E5}'), 'L'); // ˥
551        assert_eq!(map_homoglyph('\u{0500}'), 'P'); // Ԁ
552        assert_eq!(map_homoglyph('\u{1D1A}'), 'R'); // ᴚ
553        assert_eq!(map_homoglyph('\u{22A5}'), 'T'); // ⊥
554        assert_eq!(map_homoglyph('\u{2229}'), 'U'); // ∩
555        assert_eq!(map_homoglyph('\u{039B}'), 'V'); // Λ
556    }
557
558    #[test]
559    fn test_upside_down_word_hello() {
560        // "ɥǝllo" → "hello" (ɥ→h, ǝ→e, l→l, l→l, o→o)
561        assert_eq!(normalise_text("\u{0265}\u{01DD}llo"), "hello");
562    }
563
564    #[test]
565    fn test_upside_down_word_attack() {
566        // "ɐʇʇɐɔʞ" → "attack" (ɐ→a, ʇ→t, ʇ→t, ɐ→a, ɔ→c, ʞ→k)
567        assert_eq!(
568            normalise_text("\u{0250}\u{0287}\u{0287}\u{0250}\u{0254}\u{029E}"),
569            "attack"
570        );
571    }
572
573    #[test]
574    fn test_upside_down_word_text() {
575        // "ʇǝxʇ" → "text" (ʇ→t, ǝ→e, x→x, ʇ→t)
576        assert_eq!(normalise_text("\u{0287}\u{01DD}x\u{0287}"), "text");
577    }
578
579    // -- Braille-to-ASCII mapping (IS-015) ---------------------------------
580
581    #[test]
582    fn test_braille_individual_letters() {
583        assert_eq!(map_homoglyph('\u{2801}'), 'a');
584        assert_eq!(map_homoglyph('\u{2803}'), 'b');
585        assert_eq!(map_homoglyph('\u{2809}'), 'c');
586        assert_eq!(map_homoglyph('\u{2819}'), 'd');
587        assert_eq!(map_homoglyph('\u{2811}'), 'e');
588        assert_eq!(map_homoglyph('\u{280B}'), 'f');
589        assert_eq!(map_homoglyph('\u{281B}'), 'g');
590        assert_eq!(map_homoglyph('\u{2813}'), 'h');
591        assert_eq!(map_homoglyph('\u{280A}'), 'i');
592        assert_eq!(map_homoglyph('\u{281A}'), 'j');
593        assert_eq!(map_homoglyph('\u{2805}'), 'k');
594        assert_eq!(map_homoglyph('\u{2807}'), 'l');
595        assert_eq!(map_homoglyph('\u{280D}'), 'm');
596        assert_eq!(map_homoglyph('\u{281D}'), 'n');
597        assert_eq!(map_homoglyph('\u{2815}'), 'o');
598        assert_eq!(map_homoglyph('\u{280F}'), 'p');
599        assert_eq!(map_homoglyph('\u{281F}'), 'q');
600        assert_eq!(map_homoglyph('\u{2817}'), 'r');
601        assert_eq!(map_homoglyph('\u{280E}'), 's');
602        assert_eq!(map_homoglyph('\u{281E}'), 't');
603        assert_eq!(map_homoglyph('\u{2825}'), 'u');
604        assert_eq!(map_homoglyph('\u{2827}'), 'v');
605        assert_eq!(map_homoglyph('\u{283A}'), 'w');
606        assert_eq!(map_homoglyph('\u{282D}'), 'x');
607        assert_eq!(map_homoglyph('\u{283D}'), 'y');
608        assert_eq!(map_homoglyph('\u{2835}'), 'z');
609    }
610
611    #[test]
612    fn test_braille_blank_to_space() {
613        assert_eq!(map_homoglyph('\u{2800}'), ' ');
614    }
615
616    #[test]
617    fn test_braille_word_hello() {
618        // ⠓⠑⠇⠇⠕ → "hello"
619        assert_eq!(
620            normalise_text("\u{2813}\u{2811}\u{2807}\u{2807}\u{2815}"),
621            "hello"
622        );
623    }
624
625    #[test]
626    fn test_braille_word_ignore() {
627        // ⠊⠛⠝⠕⠗⠑ → "ignore"
628        assert_eq!(
629            normalise_text("\u{280A}\u{281B}\u{281D}\u{2815}\u{2817}\u{2811}"),
630            "ignore"
631        );
632    }
633
634    #[test]
635    fn test_braille_with_spaces() {
636        // ⠊⠛⠝⠕⠗⠑⠀⠞⠓⠊⠎ → "ignore this"
637        assert_eq!(
638            normalise_text(
639                "\u{280A}\u{281B}\u{281D}\u{2815}\u{2817}\u{2811}\u{2800}\u{281E}\u{2813}\u{280A}\u{280E}"
640            ),
641            "ignore this"
642        );
643    }
644
645    // -- Diacritics stripping (IS-031) -------------------------------------
646
647    #[test]
648    fn test_diacritics_cafe() {
649        assert_eq!(normalise_text("café"), "cafe");
650    }
651
652    #[test]
653    fn test_diacritics_resume() {
654        assert_eq!(normalise_text("résumé"), "resume");
655    }
656
657    #[test]
658    fn test_diacritics_naive() {
659        assert_eq!(normalise_text("naïve"), "naive");
660    }
661
662    #[test]
663    fn test_diacritics_ignore_evasion() {
664        // "ïgnörë" → "ignore"
665        assert_eq!(normalise_text("ïgnörë"), "ignore");
666    }
667
668    #[test]
669    fn test_diacritics_multiple_accents() {
670        // Various accented Latin characters
671        assert_eq!(normalise_text("àáâãäå"), "aaaaaa");
672        assert_eq!(normalise_text("èéêë"), "eeee");
673        assert_eq!(normalise_text("ñ"), "n");
674    }
675
676    #[test]
677    fn test_strip_diacritics_standalone() {
678        assert_eq!(strip_diacritics("café"), "cafe");
679        assert_eq!(strip_diacritics("résumé"), "resume");
680        assert_eq!(strip_diacritics("naïve"), "naive");
681    }
682
683    // -- Emoji stripping (IS-020) ------------------------------------------
684
685    #[test]
686    fn test_strip_emoji_simple() {
687        assert_eq!(normalise_text("he😀llo"), "hello");
688    }
689
690    #[test]
691    fn test_strip_emoji_multiple() {
692        assert_eq!(normalise_text("ig🔥no📌re"), "ignore");
693    }
694
695    #[test]
696    fn test_strip_emoji_skin_tone() {
697        // Waving hand + skin tone modifier — both should be stripped
698        assert_eq!(normalise_text("a\u{1F44B}\u{1F3FD}b"), "ab");
699    }
700
701    #[test]
702    fn test_strip_emoji_zwj_sequence() {
703        // Family emoji ZWJ sequence: 👨‍👩‍👧‍👦
704        // ZWJ (U+200D) is already in ZERO_WIDTH_CHARS, individual emoji are stripped
705        assert_eq!(
706            normalise_text("a\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}b"),
707            "ab"
708        );
709    }
710
711    #[test]
712    fn test_strip_emoji_variation_selectors() {
713        // Variation selector should be stripped
714        assert_eq!(normalise_text("a\u{FE0F}b"), "ab");
715    }
716
717    #[test]
718    fn test_strip_emoji_misc_symbols() {
719        // ☀ (U+2600) — misc symbols range
720        assert_eq!(normalise_text("a\u{2600}b"), "ab");
721    }
722
723    #[test]
724    fn test_strip_emoji_dingbats() {
725        // ✂ (U+2702) — dingbats range
726        assert_eq!(normalise_text("a\u{2702}b"), "ab");
727    }
728
729    #[test]
730    fn test_strip_emoji_transport() {
731        // 🚀 (U+1F680) — transport range
732        assert_eq!(normalise_text("a\u{1F680}b"), "ab");
733    }
734
735    #[test]
736    fn test_strip_emoji_standalone_function() {
737        assert_eq!(strip_emoji("hello 🌍 world"), "hello  world");
738        assert_eq!(strip_emoji("ig🔥no📌re"), "ignore");
739        assert_eq!(strip_emoji("no emoji here"), "no emoji here");
740    }
741
742    #[test]
743    fn test_strip_emoji_preserves_text_between() {
744        assert_eq!(
745            normalise_text("Ignore 🎯 previous 🔥 instructions"),
746            "Ignore  previous  instructions"
747        );
748    }
749
750    // -- Combined attacks --------------------------------------------------
751
752    #[test]
753    fn test_combined_zero_width_and_homoglyph() {
754        // "ign\u{200B}\u{043E}re" — zero-width space + Cyrillic о
755        let malicious = "ign\u{200B}\u{043E}re";
756        assert_eq!(normalise_text(malicious), "ignore");
757    }
758
759    #[test]
760    fn test_combined_fullwidth_and_zero_width() {
761        // Fullwidth "S" + zero-width + "ystem"
762        let malicious = "\u{FF33}\u{200B}ystem";
763        assert_eq!(normalise_text(malicious), "System");
764    }
765
766    #[test]
767    fn test_realistic_evasion_ignore_previous_instructions() {
768        // Attacker uses: Cyrillic і, zero-width space, Cyrillic о
769        let evasion = "\u{0456}gn\u{200B}\u{043E}re previ\u{043E}us instructi\u{043E}ns";
770        let normalised = normalise_text(evasion);
771        assert_eq!(normalised, "ignore previous instructions");
772    }
773
774    #[test]
775    fn test_combined_emoji_and_diacritics() {
776        // Emoji interleaved with accented text
777        assert_eq!(normalise_text("ïg🔥nörë"), "ignore");
778    }
779
780    #[test]
781    fn test_combined_emoji_and_upside_down() {
782        // Upside-down text with emoji interleaved
783        assert_eq!(normalise_text("\u{0265}😀\u{01DD}llo"), "hello");
784    }
785
786    #[test]
787    fn test_combined_braille_and_zero_width() {
788        // Braille "hello" with zero-width chars inserted
789        assert_eq!(
790            normalise_text("\u{2813}\u{200B}\u{2811}\u{200C}\u{2807}\u{2807}\u{2815}"),
791            "hello"
792        );
793    }
794
795    #[test]
796    fn test_combined_all_evasion_techniques() {
797        // A single string mixing: diacritics + zero-width + Cyrillic homoglyph +
798        // emoji + upside-down + tag characters
799        let evasion = concat!(
800            "ï",         // i with diaeresis → i (diacritics)
801            "\u{200B}",  // zero-width space (stripped)
802            "\u{0441}",  // Cyrillic с → c (homoglyph)
803            "🔥",        // emoji (stripped)
804            "\u{0250}",  // ɐ → a (upside-down)
805            "\u{E0041}", // tag A (stripped)
806            "\u{0287}",  // ʇ → t (upside-down)
807        );
808        assert_eq!(normalise_text(evasion), "icat");
809    }
810
811    #[test]
812    fn test_empty_string() {
813        assert_eq!(normalise_text(""), "");
814    }
815
816    #[test]
817    fn test_only_zero_width_chars() {
818        assert_eq!(normalise_text("\u{200B}\u{200C}\u{200D}\u{FEFF}"), "");
819    }
820
821    #[test]
822    fn test_preserves_normal_unicode() {
823        // CJK text should pass through unchanged (emoji is stripped)
824        let text = "你好世界";
825        assert_eq!(normalise_text(text), text);
826    }
827
828    #[test]
829    fn test_emoji_stripped_from_cjk_text() {
830        // Emoji next to CJK: emoji stripped, CJK preserved
831        assert_eq!(normalise_text("你好世界 🌍"), "你好世界 ");
832    }
833
834    #[test]
835    fn test_diacritics_stripped_from_accented_latin() {
836        // Accented characters have diacritics removed for security analysis
837        let result = normalise_text("café résumé naïve");
838        assert_eq!(result, "cafe resume naive");
839    }
840}