str_utils/
remove_all_invisible_characters.rs

1use alloc::{borrow::Cow, str::from_utf8_unchecked};
2
3use crate::to_substring_in_place;
4
/// Extends `&str` and `Cow<str>` with a `remove_all_invisible_characters` method.
pub trait RemoveInvisibleCharacters<'a> {
    /// Removes all invisible or non-printable characters from a given string.
    ///
    /// This function filters out a set of Unicode characters that are typically
    /// invisible or used for control or formatting purposes. This includes:
    ///
    /// - ASCII control characters (U+0000 to U+001F and U+007F), except `\t` (U+0009) and
    ///   `\n` (U+000A), which are kept
    /// - Zero-width characters and format controls:
    ///   - U+200B (Zero Width Space)
    ///   - U+200C (Zero Width Non-Joiner)
    ///   - U+200D (Zero Width Joiner)
    ///   - U+200E to U+200F (Directional marks)
    ///   - U+202A to U+202E (Directional formatting)
    ///   - U+2060 (Word Joiner)
    ///   - U+2062 to U+2064 (Invisible Times, Invisible Separator, Invisible Plus)
    ///   - U+2066 to U+2069 (Bidi Isolates)
    ///   - U+FEFF (Byte Order Mark / Zero Width No-Break Space)
    ///
    /// NOTE(review): U+2061 (Function Application) is *not* matched by the `&str`
    /// implementation in this file, although it sits inside the U+2060..U+2064 block —
    /// confirm whether that omission is intentional.
    ///
    /// These characters can interfere with text rendering, parsing, and display,
    /// and are often used in text-based attacks (e.g., for spoofing).
    ///
    /// The method consumes `self` and returns a `Cow`, so an implementation can hand back
    /// borrowed data when no new allocation is required.
    fn remove_all_invisible_characters(self) -> Cow<'a, str>;
}
27
28impl<'a> RemoveInvisibleCharacters<'a> for &'a str {
29    fn remove_all_invisible_characters(self) -> Cow<'a, str> {
30        let s = self;
31        let bytes = s.as_bytes();
32
33        let length = bytes.len();
34
35        let mut p = 0;
36
37        let check_character_whether_to_remove = |p: usize, e: u8, width: usize| -> bool {
38            match width {
39                1 => {
40                    match e {
41                        // ascii controls, only remain \t, \n
42                        0..=8 | 11..=13 | 14..=31 | 127 => return true,
43                        _ => (),
44                    }
45                },
46                3 => match e {
47                    0xE2 => match bytes[p + 1] {
48                        // zero width characters and bidirectional controls
49                        0x80 => match bytes[p + 2] {
50                            0x8B..=0x8F | 0xAA..=0xAE => return true,
51                            _ => (),
52                        },
53                        // word joiner, invisible times/separator/plus and isolate characters
54                        0x81 => match bytes[p + 2] {
55                            0xA0 | 0xA2..=0xA4 | 0xA6..=0xA9 => return true,
56                            _ => (),
57                        },
58                        _ => (),
59                    },
60                    // zero width character
61                    0xEF => {
62                        if bytes[p + 1] == 0xBB && bytes[p + 2] == 0xBF {
63                            return true;
64                        }
65                    },
66                    _ => (),
67                },
68                _ => (),
69            }
70
71            false
72        };
73
74        let width = loop {
75            if p == length {
76                return Cow::Borrowed(s);
77            }
78
79            let e = bytes[p];
80
81            let width = unsafe { utf8_width::get_width_assume_valid(e) };
82
83            if check_character_whether_to_remove(p, e, width) {
84                break width;
85            } else {
86                p += width;
87            }
88        };
89
90        let heading_normal_characters_end_index = p;
91
92        p += width;
93
94        // there are four situations which can use a string slice:
95        // 1. <invisible_characters>
96        // 2. <normal_characters><invisible_characters>
97        // 3. <invisible_characters><normal_characters>
98        // 4. <invisible_characters><normal_characters><invisible_characters>
99
100        // continue to find more invisible characters
101        let width = loop {
102            if p == length {
103                // situation 1 or situation 2
104
105                return Cow::Borrowed(unsafe {
106                    from_utf8_unchecked(&bytes[..heading_normal_characters_end_index])
107                });
108            }
109
110            let e = bytes[p];
111
112            let width = unsafe { utf8_width::get_width_assume_valid(e) };
113
114            if check_character_whether_to_remove(p, e, width) {
115                p += width;
116            } else {
117                break width;
118            }
119        };
120
121        let following_invisible_characters_end_index = p;
122
123        p += width;
124
125        // continue to find more normal characters
126        let width = loop {
127            if p == length {
128                // situation 3
129
130                return Cow::Borrowed(unsafe {
131                    from_utf8_unchecked(&bytes[following_invisible_characters_end_index..])
132                });
133            }
134
135            let e = bytes[p];
136
137            let width = unsafe { utf8_width::get_width_assume_valid(e) };
138
139            if check_character_whether_to_remove(p, e, width) {
140                break width;
141            } else {
142                p += width;
143            }
144        };
145
146        let following_normal_characters_end_index = p;
147
148        p += width;
149
150        // continue to find more invisible characters
151        let width = loop {
152            if p == length {
153                // situation 4
154
155                return Cow::Borrowed(unsafe {
156                    from_utf8_unchecked(
157                        &bytes[following_invisible_characters_end_index
158                            ..following_normal_characters_end_index],
159                    )
160                });
161            }
162
163            let e = bytes[p];
164
165            let width = unsafe { utf8_width::get_width_assume_valid(e) };
166
167            if check_character_whether_to_remove(p, e, width) {
168                p += width;
169            } else {
170                break width;
171            }
172        };
173
174        let mut new_v = bytes
175            [following_invisible_characters_end_index..following_normal_characters_end_index]
176            .to_vec();
177
178        let mut start = p;
179
180        p += width;
181
182        loop {
183            if p == length {
184                break;
185            }
186
187            let e = bytes[p];
188
189            let width = unsafe { utf8_width::get_width_assume_valid(e) };
190
191            if check_character_whether_to_remove(p, e, width) {
192                new_v.extend_from_slice(&bytes[start..p]);
193
194                start = p + width;
195            }
196
197            p += width;
198        }
199
200        new_v.extend_from_slice(&bytes[start..p]);
201
202        Cow::Owned(unsafe { String::from_utf8_unchecked(new_v) })
203    }
204}
205
206impl<'a> RemoveInvisibleCharacters<'a> for Cow<'a, str> {
207    #[inline]
208    fn remove_all_invisible_characters(self) -> Cow<'a, str> {
209        match self {
210            Cow::Borrowed(s) => s.remove_all_invisible_characters(),
211            Cow::Owned(s) => match s.remove_all_invisible_characters() {
212                Cow::Borrowed(ss) => Cow::Owned(unsafe { to_substring_in_place!(s, ss) }),
213                Cow::Owned(s) => Cow::Owned(s),
214            },
215        }
216    }
217}