str_utils/
remove_all_invisible_characters.rs

1use alloc::{borrow::Cow, str};
2
/// Extends every type that implements `AsRef<str>` with a
/// `remove_all_invisible_characters` method.
pub trait RemoveInvisibleCharacters {
    /// Removes all invisible or non-printable characters from a given string.
    ///
    /// Every character listed below is dropped; all other characters are kept in their
    /// original order:
    ///
    /// - ASCII control characters (U+0000 to U+001F and U+007F), except `\t` and `\n`
    ///   (note that `\r` is removed)
    /// - Zero-width characters and format controls:
    ///   - U+200B (Zero Width Space)
    ///   - U+200C (Zero Width Non-Joiner)
    ///   - U+200D (Zero Width Joiner)
    ///   - U+200E to U+200F (Directional marks)
    ///   - U+202A to U+202E (Directional formatting)
    ///   - U+2060 to U+2064 (Word Joiner and Invisible Math Symbols)
    ///   - U+2066 to U+2069 (Bidi Isolates)
    ///   - U+FEFF (Byte Order Mark / Zero Width No-Break Space)
    ///
    /// These characters can interfere with text rendering, parsing, and display,
    /// and are often used in text-based attacks (e.g., for spoofing).
    ///
    /// # Returns
    ///
    /// A `Cow::Borrowed` slice of the input when the kept characters form one contiguous
    /// run (nothing to remove, or removals only at the start and/or end of the string).
    /// Otherwise a `Cow::Owned` string holding the kept characters.
    fn remove_all_invisible_characters(&self) -> Cow<'_, str>;
}

impl<T: AsRef<str>> RemoveInvisibleCharacters for T {
    fn remove_all_invisible_characters(&self) -> Cow<'_, str> {
        // Returns `true` for exactly the characters this method strips.
        fn is_invisible(c: char) -> bool {
            matches!(
                c,
                // ASCII controls; `\t` (U+0009) and `\n` (U+000A) are deliberately kept
                '\u{0000}'..='\u{0008}'
                    | '\u{000B}'..='\u{001F}'
                    | '\u{007F}'
                    // zero width characters and directional marks
                    | '\u{200B}'..='\u{200F}'
                    // directional formatting
                    | '\u{202A}'..='\u{202E}'
                    // word joiner and invisible math symbols
                    | '\u{2060}'..='\u{2064}'
                    // bidi isolates
                    | '\u{2066}'..='\u{2069}'
                    // byte order mark / zero width no-break space
                    | '\u{FEFF}'
            )
        }

        // Invisible characters at either end are removed by narrowing the slice,
        // which never needs an allocation.
        let trimmed = self.as_ref().trim_matches(is_invisible);

        if !trimmed.contains(is_invisible) {
            // The kept characters are one contiguous run: borrow it.
            return Cow::Borrowed(trimmed);
        }

        // Invisible characters sit between kept characters, so the kept runs have to be
        // stitched together. `split` yields every run between removed characters
        // (possibly empty), in order.
        let mut cleaned = String::with_capacity(trimmed.len());

        cleaned.extend(trimmed.split(is_invisible));

        Cow::Owned(cleaned)
    }
}