edit/unicode/utf8.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4use std::{hint, iter};
5
6/// An iterator over UTF-8 encoded characters.
7///
8/// This differs from [`std::str::Chars`] in that it works on unsanitized
9/// byte slices and transparently replaces invalid UTF-8 sequences with U+FFFD.
10///
11/// This follows ICU's bitmask approach for `U8_NEXT_OR_FFFD` relatively
12/// closely. This is important for compatibility, because it implements the
13/// WHATWG recommendation for UTF8 error recovery. It's also helpful, because
14/// the excellent folks at ICU have probably spent a lot of time optimizing it.
15#[derive(Clone, Copy)]
16pub struct Utf8Chars<'a> {
17 source: &'a [u8],
18 offset: usize,
19}
20
21impl<'a> Utf8Chars<'a> {
22 /// Creates a new `Utf8Chars` iterator starting at the given `offset`.
23 pub fn new(source: &'a [u8], offset: usize) -> Self {
24 Self { source, offset }
25 }
26
27 /// Returns the byte slice this iterator was created with.
28 pub fn source(&self) -> &'a [u8] {
29 self.source
30 }
31
32 /// Checks if the source is empty.
33 pub fn is_empty(&self) -> bool {
34 self.source.is_empty()
35 }
36
37 /// Returns the length of the source.
38 pub fn len(&self) -> usize {
39 self.source.len()
40 }
41
42 /// Returns the current offset in the byte slice.
43 ///
44 /// This will be past the last returned character.
45 pub fn offset(&self) -> usize {
46 self.offset
47 }
48
49 /// Sets the offset to continue iterating from.
50 pub fn seek(&mut self, offset: usize) {
51 self.offset = offset;
52 }
53
54 /// Returns true if `next` will return another character.
55 pub fn has_next(&self) -> bool {
56 self.offset < self.source.len()
57 }
58
59 // I found that on mixed 50/50 English/Non-English text,
60 // performance actually suffers when this gets inlined.
61 #[cold]
62 fn next_slow(&mut self, c: u8) -> char {
63 if self.offset >= self.source.len() {
64 return Self::fffd();
65 }
66
67 let mut cp = c as u32;
68
69 if cp < 0xE0 {
70 // UTF8-2 = %xC2-DF UTF8-tail
71
72 if cp < 0xC2 {
73 return Self::fffd();
74 }
75
76 // The lead byte is 110xxxxx
77 // -> Strip off the 110 prefix
78 cp &= !0xE0;
79 } else if cp < 0xF0 {
80 // UTF8-3 =
81 // %xE0 %xA0-BF UTF8-tail
82 // %xE1-EC UTF8-tail UTF8-tail
83 // %xED %x80-9F UTF8-tail
84 // %xEE-EF UTF8-tail UTF8-tail
85
86 // This is a pretty neat approach seen in ICU4C, because it's a 1:1 translation of the RFC.
87 // I don't understand why others don't do the same thing. It's rather performant.
88 const BITS_80_9F: u8 = 1 << 0b100; // 0x80-9F, aka 0b100xxxxx
89 const BITS_A0_BF: u8 = 1 << 0b101; // 0xA0-BF, aka 0b101xxxxx
90 const BITS_BOTH: u8 = BITS_80_9F | BITS_A0_BF;
91 const LEAD_TRAIL1_BITS: [u8; 16] = [
92 // v-- lead byte
93 BITS_A0_BF, // 0xE0
94 BITS_BOTH, // 0xE1
95 BITS_BOTH, // 0xE2
96 BITS_BOTH, // 0xE3
97 BITS_BOTH, // 0xE4
98 BITS_BOTH, // 0xE5
99 BITS_BOTH, // 0xE6
100 BITS_BOTH, // 0xE7
101 BITS_BOTH, // 0xE8
102 BITS_BOTH, // 0xE9
103 BITS_BOTH, // 0xEA
104 BITS_BOTH, // 0xEB
105 BITS_BOTH, // 0xEC
106 BITS_80_9F, // 0xED
107 BITS_BOTH, // 0xEE
108 BITS_BOTH, // 0xEF
109 ];
110
111 // The lead byte is 1110xxxx
112 // -> Strip off the 1110 prefix
113 cp &= !0xF0;
114
115 let t = self.source[self.offset] as u32;
116 if LEAD_TRAIL1_BITS[cp as usize] & (1 << (t >> 5)) == 0 {
117 return Self::fffd();
118 }
119 cp = (cp << 6) | (t & 0x3F);
120
121 self.offset += 1;
122 if self.offset >= self.source.len() {
123 return Self::fffd();
124 }
125 } else {
126 // UTF8-4 =
127 // %xF0 %x90-BF UTF8-tail UTF8-tail
128 // %xF1-F3 UTF8-tail UTF8-tail UTF8-tail
129 // %xF4 %x80-8F UTF8-tail UTF8-tail
130
131 // This is similar to the above, but with the indices flipped:
132 // The trail byte is the index and the lead byte mask is the value.
133 // This is because the split at 0x90 requires more bits than fit into an u8.
134 const TRAIL1_LEAD_BITS: [u8; 16] = [
135 // --------- 0xF4 lead
136 // | ...
137 // | +---- 0xF0 lead
138 // v v
139 0b_00000, //
140 0b_00000, //
141 0b_00000, //
142 0b_00000, //
143 0b_00000, //
144 0b_00000, //
145 0b_00000, // trail bytes:
146 0b_00000, //
147 0b_11110, // 0x80-8F -> 0x80-8F can be preceded by 0xF1-F4
148 0b_01111, // 0x90-9F -v
149 0b_01111, // 0xA0-AF -> 0x90-BF can be preceded by 0xF0-F3
150 0b_01111, // 0xB0-BF -^
151 0b_00000, //
152 0b_00000, //
153 0b_00000, //
154 0b_00000, //
155 ];
156
157 // The lead byte *may* be 11110xxx, but could also be e.g. 11111xxx.
158 // -> Only strip off the 1111 prefix
159 cp &= !0xF0;
160
161 // Now we can verify if it's actually <= 0xF4.
162 // Curiously, this if condition does a lot of heavy lifting for
163 // performance (+13%). I think it's just a coincidence though.
164 if cp > 4 {
165 return Self::fffd();
166 }
167
168 let t = self.source[self.offset] as u32;
169 if TRAIL1_LEAD_BITS[(t >> 4) as usize] & (1 << cp) == 0 {
170 return Self::fffd();
171 }
172 cp = (cp << 6) | (t & 0x3F);
173
174 self.offset += 1;
175 if self.offset >= self.source.len() {
176 return Self::fffd();
177 }
178
179 // UTF8-tail = %x80-BF
180 let t = (self.source[self.offset] as u32).wrapping_sub(0x80);
181 if t > 0x3F {
182 return Self::fffd();
183 }
184 cp = (cp << 6) | t;
185
186 self.offset += 1;
187 if self.offset >= self.source.len() {
188 return Self::fffd();
189 }
190 }
191
192 // SAFETY: All branches above check for `if self.offset >= self.source.len()`
193 // one way or another. This is here because the compiler doesn't get it otherwise.
194 unsafe { hint::assert_unchecked(self.offset < self.source.len()) };
195
196 // UTF8-tail = %x80-BF
197 let t = (self.source[self.offset] as u32).wrapping_sub(0x80);
198 if t > 0x3F {
199 return Self::fffd();
200 }
201 cp = (cp << 6) | t;
202
203 self.offset += 1;
204
205 // SAFETY: If `cp` wasn't a valid codepoint, we already returned U+FFFD above.
206 unsafe { char::from_u32_unchecked(cp) }
207 }
208
209 // This simultaneously serves as a `cold_path` marker.
210 // It improves performance by ~5% and reduces code size.
211 #[cold]
212 #[inline(always)]
213 fn fffd() -> char {
214 '\u{FFFD}'
215 }
216}
217
218impl Iterator for Utf8Chars<'_> {
219 type Item = char;
220
221 #[inline]
222 fn next(&mut self) -> Option<Self::Item> {
223 if self.offset >= self.source.len() {
224 return None;
225 }
226
227 let c = self.source[self.offset];
228 self.offset += 1;
229
230 // Fast-passing ASCII allows this function to be trivially inlined everywhere,
231 // as the full decoder is a little too large for that.
232 if (c & 0x80) == 0 {
233 // UTF8-1 = %x00-7F
234 Some(c as char)
235 } else {
236 // Weirdly enough, adding a hint here to assert that `next_slow`
237 // only returns codepoints >= 0x80 makes `ucd` ~5% slower.
238 Some(self.next_slow(c))
239 }
240 }
241
242 #[inline]
243 fn size_hint(&self) -> (usize, Option<usize>) {
244 // Lower bound: All remaining bytes are 4-byte sequences.
245 // Upper bound: All remaining bytes are ASCII.
246 let remaining = self.source.len() - self.offset;
247 (remaining / 4, Some(remaining))
248 }
249}
250
251impl iter::FusedIterator for Utf8Chars<'_> {}
252
253#[cfg(test)]
254mod tests {
255 use super::*;
256
257 #[test]
258 fn test_broken_utf8() {
259 let source = [b'a', 0xED, 0xA0, 0x80, b'b'];
260 let mut chars = Utf8Chars::new(&source, 0);
261 let mut offset = 0;
262 for chunk in source.utf8_chunks() {
263 for ch in chunk.valid().chars() {
264 offset += ch.len_utf8();
265 assert_eq!(chars.next(), Some(ch));
266 assert_eq!(chars.offset(), offset);
267 }
268 if !chunk.invalid().is_empty() {
269 offset += chunk.invalid().len();
270 assert_eq!(chars.next(), Some('\u{FFFD}'));
271 assert_eq!(chars.offset(), offset);
272 }
273 }
274 }
275}