str_utils/
remove_all_invisible_characters.rs1use alloc::{borrow::Cow, str::from_utf8_unchecked};
2
3use crate::to_substring_in_place;
4
/// Extension trait that adds `remove_all_invisible_characters` to string-like types.
pub trait RemoveInvisibleCharacters<'a> {
    /// Consumes `self` and returns its text with all invisible characters removed.
    ///
    /// The result is a [`Cow`], so an implementation can hand back a borrowed
    /// slice when no new buffer is required and an owned `String` otherwise.
    ///
    /// The `&str` implementation in this file treats the following code points
    /// as invisible:
    ///
    /// * ASCII control characters `U+0000..=U+0008`, `U+000B..=U+001F` and
    ///   `U+007F` (tab `U+0009` and line feed `U+000A` are kept),
    /// * `U+200B..=U+200F` and `U+202A..=U+202E`,
    /// * `U+2060`, `U+2062..=U+2064` and `U+2066..=U+2069`,
    /// * `U+FEFF`.
    fn remove_all_invisible_characters(self) -> Cow<'a, str>;
}
27
28impl<'a> RemoveInvisibleCharacters<'a> for &'a str {
29 fn remove_all_invisible_characters(self) -> Cow<'a, str> {
30 let s = self;
31 let bytes = s.as_bytes();
32
33 let length = bytes.len();
34
35 let mut p = 0;
36
37 let check_character_whether_to_remove = |p: usize, e: u8, width: usize| -> bool {
38 match width {
39 1 => {
40 match e {
41 0..=8 | 11..=13 | 14..=31 | 127 => return true,
43 _ => (),
44 }
45 },
46 3 => match e {
47 0xE2 => match bytes[p + 1] {
48 0x80 => match bytes[p + 2] {
50 0x8B..=0x8F | 0xAA..=0xAE => return true,
51 _ => (),
52 },
53 0x81 => match bytes[p + 2] {
55 0xA0 | 0xA2..=0xA4 | 0xA6..=0xA9 => return true,
56 _ => (),
57 },
58 _ => (),
59 },
60 0xEF => {
62 if bytes[p + 1] == 0xBB && bytes[p + 2] == 0xBF {
63 return true;
64 }
65 },
66 _ => (),
67 },
68 _ => (),
69 }
70
71 false
72 };
73
74 let width = loop {
75 if p == length {
76 return Cow::Borrowed(s);
77 }
78
79 let e = bytes[p];
80
81 let width = unsafe { utf8_width::get_width_assume_valid(e) };
82
83 if check_character_whether_to_remove(p, e, width) {
84 break width;
85 } else {
86 p += width;
87 }
88 };
89
90 let heading_normal_characters_end_index = p;
91
92 p += width;
93
94 let width = loop {
102 if p == length {
103 return Cow::Borrowed(unsafe {
106 from_utf8_unchecked(&bytes[..heading_normal_characters_end_index])
107 });
108 }
109
110 let e = bytes[p];
111
112 let width = unsafe { utf8_width::get_width_assume_valid(e) };
113
114 if check_character_whether_to_remove(p, e, width) {
115 p += width;
116 } else {
117 break width;
118 }
119 };
120
121 let following_invisible_characters_end_index = p;
122
123 p += width;
124
125 let width = loop {
127 if p == length {
128 return Cow::Borrowed(unsafe {
131 from_utf8_unchecked(&bytes[following_invisible_characters_end_index..])
132 });
133 }
134
135 let e = bytes[p];
136
137 let width = unsafe { utf8_width::get_width_assume_valid(e) };
138
139 if check_character_whether_to_remove(p, e, width) {
140 break width;
141 } else {
142 p += width;
143 }
144 };
145
146 let following_normal_characters_end_index = p;
147
148 p += width;
149
150 let width = loop {
152 if p == length {
153 return Cow::Borrowed(unsafe {
156 from_utf8_unchecked(
157 &bytes[following_invisible_characters_end_index
158 ..following_normal_characters_end_index],
159 )
160 });
161 }
162
163 let e = bytes[p];
164
165 let width = unsafe { utf8_width::get_width_assume_valid(e) };
166
167 if check_character_whether_to_remove(p, e, width) {
168 p += width;
169 } else {
170 break width;
171 }
172 };
173
174 let mut new_v = bytes
175 [following_invisible_characters_end_index..following_normal_characters_end_index]
176 .to_vec();
177
178 let mut start = p;
179
180 p += width;
181
182 loop {
183 if p == length {
184 break;
185 }
186
187 let e = bytes[p];
188
189 let width = unsafe { utf8_width::get_width_assume_valid(e) };
190
191 if check_character_whether_to_remove(p, e, width) {
192 new_v.extend_from_slice(&bytes[start..p]);
193
194 start = p + width;
195 }
196
197 p += width;
198 }
199
200 new_v.extend_from_slice(&bytes[start..p]);
201
202 Cow::Owned(unsafe { String::from_utf8_unchecked(new_v) })
203 }
204}
205
206impl<'a> RemoveInvisibleCharacters<'a> for Cow<'a, str> {
207 #[inline]
208 fn remove_all_invisible_characters(self) -> Cow<'a, str> {
209 match self {
210 Cow::Borrowed(s) => s.remove_all_invisible_characters(),
211 Cow::Owned(s) => match s.remove_all_invisible_characters() {
212 Cow::Borrowed(ss) => Cow::Owned(unsafe { to_substring_in_place!(s, ss) }),
213 Cow::Owned(s) => Cow::Owned(s),
214 },
215 }
216 }
217}