str_utils/remove_all_invisible_characters.rs
1use std::borrow::Cow;
2
/// Extends every type that implements `AsRef<str>` with a `remove_all_invisible_characters` method.
pub trait RemoveInvisibleCharacters {
    /// Removes all invisible or non-printable characters from a given string.
    ///
    /// This function filters out a comprehensive set of Unicode characters that are typically
    /// invisible or used for control or formatting purposes. This includes:
    ///
    /// - ASCII control characters (U+0000 to U+001F and U+007F), excluding `\t` and `\n`
    ///   (note that `\r` is removed)
    /// - Zero-width characters and format controls:
    ///   - U+200B (Zero Width Space)
    ///   - U+200C (Zero Width Non-Joiner)
    ///   - U+200D (Zero Width Joiner)
    ///   - U+200E to U+200F (Directional marks)
    ///   - U+202A to U+202E (Directional formatting)
    ///   - U+2060 to U+2064 (Word Joiner and Invisible Math Symbols)
    ///   - U+2066 to U+2069 (Bidi Isolates)
    ///   - U+FEFF (Byte Order Mark / Zero Width No-Break Space)
    ///
    /// These characters can interfere with text rendering, parsing, and display,
    /// and are often used in text-based attacks (e.g., for spoofing).
    ///
    /// # Returns
    ///
    /// `Cow::Borrowed` pointing at the original text when nothing has to be removed (no
    /// allocation happens in that case); otherwise `Cow::Owned` holding the filtered copy.
    fn remove_all_invisible_characters(&self) -> Cow<'_, str>;
}

impl<T: AsRef<str>> RemoveInvisibleCharacters for T {
    fn remove_all_invisible_characters(&self) -> Cow<'_, str> {
        /// Returns `true` when `c` belongs to the documented set of characters to strip.
        fn is_invisible(c: char) -> bool {
            match c {
                // ASCII controls; `\t` (U+0009) and `\n` (U+000A) are deliberately kept
                '\u{0000}'..='\u{0008}' | '\u{000B}'..='\u{001F}' | '\u{007F}' => true,
                // zero width space/joiners, directional marks and directional formatting
                '\u{200B}'..='\u{200F}' | '\u{202A}'..='\u{202E}' => true,
                // word joiner and the invisible math operators, including U+2061
                // (function application); U+2065 is unassigned and therefore kept
                '\u{2060}'..='\u{2064}' => true,
                // bidi isolates
                '\u{2066}'..='\u{2069}' => true,
                // byte order mark / zero width no-break space
                '\u{FEFF}' => true,
                _ => false,
            }
        }

        let s = self.as_ref();

        // Fast path: when no character has to go, hand back the input without allocating.
        let first = match s.find(is_invisible) {
            Some(index) => index,
            None => return Cow::Borrowed(s),
        };

        // `first` is the byte offset of a character found by `str::find`, so it always lies
        // on a char boundary and `split_at` cannot panic.
        let (clean_prefix, rest) = s.split_at(first);

        // The result can never be longer than the input, so one allocation is enough.
        let mut cleaned = String::with_capacity(s.len());

        cleaned.push_str(clean_prefix);
        cleaned.extend(rest.chars().filter(|&c| !is_invisible(c)));

        Cow::Owned(cleaned)
    }
}