edit/unicode/
utf8.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4use std::{hint, iter};
5
6/// An iterator over UTF-8 encoded characters.
7///
8/// This differs from [`std::str::Chars`] in that it works on unsanitized
9/// byte slices and transparently replaces invalid UTF-8 sequences with U+FFFD.
10///
11/// This follows ICU's bitmask approach for `U8_NEXT_OR_FFFD` relatively
12/// closely. This is important for compatibility, because it implements the
13/// WHATWG recommendation for UTF8 error recovery. It's also helpful, because
14/// the excellent folks at ICU have probably spent a lot of time optimizing it.
15#[derive(Clone, Copy)]
16pub struct Utf8Chars<'a> {
17    source: &'a [u8],
18    offset: usize,
19}
20
21impl<'a> Utf8Chars<'a> {
22    /// Creates a new `Utf8Chars` iterator starting at the given `offset`.
23    pub fn new(source: &'a [u8], offset: usize) -> Self {
24        Self { source, offset }
25    }
26
27    /// Returns the byte slice this iterator was created with.
28    pub fn source(&self) -> &'a [u8] {
29        self.source
30    }
31
32    /// Checks if the source is empty.
33    pub fn is_empty(&self) -> bool {
34        self.source.is_empty()
35    }
36
37    /// Returns the length of the source.
38    pub fn len(&self) -> usize {
39        self.source.len()
40    }
41
42    /// Returns the current offset in the byte slice.
43    ///
44    /// This will be past the last returned character.
45    pub fn offset(&self) -> usize {
46        self.offset
47    }
48
49    /// Sets the offset to continue iterating from.
50    pub fn seek(&mut self, offset: usize) {
51        self.offset = offset;
52    }
53
54    /// Returns true if `next` will return another character.
55    pub fn has_next(&self) -> bool {
56        self.offset < self.source.len()
57    }
58
59    // I found that on mixed 50/50 English/Non-English text,
60    // performance actually suffers when this gets inlined.
61    #[cold]
62    fn next_slow(&mut self, c: u8) -> char {
63        if self.offset >= self.source.len() {
64            return Self::fffd();
65        }
66
67        let mut cp = c as u32;
68
69        if cp < 0xE0 {
70            // UTF8-2 = %xC2-DF UTF8-tail
71
72            if cp < 0xC2 {
73                return Self::fffd();
74            }
75
76            // The lead byte is 110xxxxx
77            // -> Strip off the 110 prefix
78            cp &= !0xE0;
79        } else if cp < 0xF0 {
80            // UTF8-3 =
81            //   %xE0    %xA0-BF   UTF8-tail
82            //   %xE1-EC UTF8-tail UTF8-tail
83            //   %xED    %x80-9F   UTF8-tail
84            //   %xEE-EF UTF8-tail UTF8-tail
85
86            // This is a pretty neat approach seen in ICU4C, because it's a 1:1 translation of the RFC.
87            // I don't understand why others don't do the same thing. It's rather performant.
88            const BITS_80_9F: u8 = 1 << 0b100; // 0x80-9F, aka 0b100xxxxx
89            const BITS_A0_BF: u8 = 1 << 0b101; // 0xA0-BF, aka 0b101xxxxx
90            const BITS_BOTH: u8 = BITS_80_9F | BITS_A0_BF;
91            const LEAD_TRAIL1_BITS: [u8; 16] = [
92                //             v-- lead byte
93                BITS_A0_BF, // 0xE0
94                BITS_BOTH,  // 0xE1
95                BITS_BOTH,  // 0xE2
96                BITS_BOTH,  // 0xE3
97                BITS_BOTH,  // 0xE4
98                BITS_BOTH,  // 0xE5
99                BITS_BOTH,  // 0xE6
100                BITS_BOTH,  // 0xE7
101                BITS_BOTH,  // 0xE8
102                BITS_BOTH,  // 0xE9
103                BITS_BOTH,  // 0xEA
104                BITS_BOTH,  // 0xEB
105                BITS_BOTH,  // 0xEC
106                BITS_80_9F, // 0xED
107                BITS_BOTH,  // 0xEE
108                BITS_BOTH,  // 0xEF
109            ];
110
111            // The lead byte is 1110xxxx
112            // -> Strip off the 1110 prefix
113            cp &= !0xF0;
114
115            let t = self.source[self.offset] as u32;
116            if LEAD_TRAIL1_BITS[cp as usize] & (1 << (t >> 5)) == 0 {
117                return Self::fffd();
118            }
119            cp = (cp << 6) | (t & 0x3F);
120
121            self.offset += 1;
122            if self.offset >= self.source.len() {
123                return Self::fffd();
124            }
125        } else {
126            // UTF8-4 =
127            //   %xF0    %x90-BF   UTF8-tail UTF8-tail
128            //   %xF1-F3 UTF8-tail UTF8-tail UTF8-tail
129            //   %xF4    %x80-8F   UTF8-tail UTF8-tail
130
131            // This is similar to the above, but with the indices flipped:
132            // The trail byte is the index and the lead byte mask is the value.
133            // This is because the split at 0x90 requires more bits than fit into an u8.
134            const TRAIL1_LEAD_BITS: [u8; 16] = [
135                // --------- 0xF4 lead
136                // |         ...
137                // |   +---- 0xF0 lead
138                // v   v
139                0b_00000, //
140                0b_00000, //
141                0b_00000, //
142                0b_00000, //
143                0b_00000, //
144                0b_00000, //
145                0b_00000, // trail bytes:
146                0b_00000, //
147                0b_11110, // 0x80-8F -> 0x80-8F can be preceded by 0xF1-F4
148                0b_01111, // 0x90-9F -v
149                0b_01111, // 0xA0-AF -> 0x90-BF can be preceded by 0xF0-F3
150                0b_01111, // 0xB0-BF -^
151                0b_00000, //
152                0b_00000, //
153                0b_00000, //
154                0b_00000, //
155            ];
156
157            // The lead byte *may* be 11110xxx, but could also be e.g. 11111xxx.
158            // -> Only strip off the 1111 prefix
159            cp &= !0xF0;
160
161            // Now we can verify if it's actually <= 0xF4.
162            // Curiously, this if condition does a lot of heavy lifting for
163            // performance (+13%). I think it's just a coincidence though.
164            if cp > 4 {
165                return Self::fffd();
166            }
167
168            let t = self.source[self.offset] as u32;
169            if TRAIL1_LEAD_BITS[(t >> 4) as usize] & (1 << cp) == 0 {
170                return Self::fffd();
171            }
172            cp = (cp << 6) | (t & 0x3F);
173
174            self.offset += 1;
175            if self.offset >= self.source.len() {
176                return Self::fffd();
177            }
178
179            // UTF8-tail = %x80-BF
180            let t = (self.source[self.offset] as u32).wrapping_sub(0x80);
181            if t > 0x3F {
182                return Self::fffd();
183            }
184            cp = (cp << 6) | t;
185
186            self.offset += 1;
187            if self.offset >= self.source.len() {
188                return Self::fffd();
189            }
190        }
191
192        // SAFETY: All branches above check for `if self.offset >= self.source.len()`
193        // one way or another. This is here because the compiler doesn't get it otherwise.
194        unsafe { hint::assert_unchecked(self.offset < self.source.len()) };
195
196        // UTF8-tail = %x80-BF
197        let t = (self.source[self.offset] as u32).wrapping_sub(0x80);
198        if t > 0x3F {
199            return Self::fffd();
200        }
201        cp = (cp << 6) | t;
202
203        self.offset += 1;
204
205        // SAFETY: If `cp` wasn't a valid codepoint, we already returned U+FFFD above.
206        unsafe { char::from_u32_unchecked(cp) }
207    }
208
209    // This simultaneously serves as a `cold_path` marker.
210    // It improves performance by ~5% and reduces code size.
211    #[cold]
212    #[inline(always)]
213    fn fffd() -> char {
214        '\u{FFFD}'
215    }
216}
217
218impl Iterator for Utf8Chars<'_> {
219    type Item = char;
220
221    #[inline]
222    fn next(&mut self) -> Option<Self::Item> {
223        if self.offset >= self.source.len() {
224            return None;
225        }
226
227        let c = self.source[self.offset];
228        self.offset += 1;
229
230        // Fast-passing ASCII allows this function to be trivially inlined everywhere,
231        // as the full decoder is a little too large for that.
232        if (c & 0x80) == 0 {
233            // UTF8-1 = %x00-7F
234            Some(c as char)
235        } else {
236            // Weirdly enough, adding a hint here to assert that `next_slow`
237            // only returns codepoints >= 0x80 makes `ucd` ~5% slower.
238            Some(self.next_slow(c))
239        }
240    }
241
242    #[inline]
243    fn size_hint(&self) -> (usize, Option<usize>) {
244        // Lower bound: All remaining bytes are 4-byte sequences.
245        // Upper bound: All remaining bytes are ASCII.
246        let remaining = self.source.len() - self.offset;
247        (remaining / 4, Some(remaining))
248    }
249}
250
251impl iter::FusedIterator for Utf8Chars<'_> {}
252
253#[cfg(test)]
254mod tests {
255    use super::*;
256
257    #[test]
258    fn test_broken_utf8() {
259        let source = [b'a', 0xED, 0xA0, 0x80, b'b'];
260        let mut chars = Utf8Chars::new(&source, 0);
261        let mut offset = 0;
262        for chunk in source.utf8_chunks() {
263            for ch in chunk.valid().chars() {
264                offset += ch.len_utf8();
265                assert_eq!(chars.next(), Some(ch));
266                assert_eq!(chars.offset(), offset);
267            }
268            if !chunk.invalid().is_empty() {
269                offset += chunk.invalid().len();
270                assert_eq!(chars.next(), Some('\u{FFFD}'));
271                assert_eq!(chars.offset(), offset);
272            }
273        }
274    }
275}