boa_parser/source/
utf8.rs

1use super::ReadChar;
2use std::io::{self, Bytes, Read};
3
/// Source adapter that reads a UTF-8 encoded stream one byte at a time.
///
/// The wrapped reader `R` is consumed through [`std::io::Bytes`]; the
/// `ReadChar` implementation for this type assembles those bytes into
/// code points.
#[derive(Debug)]
pub struct UTF8Input<R> {
    /// Byte-by-byte iterator over the underlying reader.
    input: Bytes<R>,
}
9
10impl<R: Read> UTF8Input<R> {
11    /// Creates a new `UTF8Input` from a UTF-8 encoded source.
12    pub(crate) fn new(iter: R) -> Self {
13        Self {
14            #[allow(clippy::unbuffered_bytes)]
15            input: iter.bytes(),
16        }
17    }
18}
19
20impl<R: Read> UTF8Input<R> {
21    /// Retrieves the next byte
22    fn next_byte(&mut self) -> io::Result<Option<u8>> {
23        self.input.next().transpose()
24    }
25}
26
27impl<R: Read> ReadChar for UTF8Input<R> {
28    /// Retrieves the next unchecked char in u32 code point.
29    fn next_char(&mut self) -> io::Result<Option<u32>> {
30        // Decode UTF-8
31        let x = match self.next_byte()? {
32            Some(b) if b >= 128 => b,         // UTF-8 codepoint
33            b => return Ok(b.map(u32::from)), // ASCII or None
34        };
35
36        // Multibyte case follows
37        // Decode from a byte combination out of: [[[x y] z] w]
38        // NOTE: Performance is sensitive to the exact formulation here
39        let init = utf8_first_byte(x, 2);
40        let y = self.next_byte()?.unwrap_or(0);
41        let mut ch = utf8_acc_cont_byte(init, y);
42        if x >= 0xE0 {
43            // [[x y z] w] case
44            // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
45            let z = self.next_byte()?.unwrap_or(0);
46            let y_z = utf8_acc_cont_byte(u32::from(y & CONT_MASK), z);
47            ch = (init << 12) | y_z;
48            if x >= 0xF0 {
49                // [x y z w] case
50                // use only the lower 3 bits of `init`
51                let w = self.next_byte()?.unwrap_or(0);
52                ch = ((init & 7) << 18) | utf8_acc_cont_byte(y_z, w);
53            }
54        }
55
56        Ok(Some(ch))
57    }
58}
59
/// Selects the six payload bits of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0x3F;
62
/// Extracts the payload bits of a sequence's lead byte as the initial
/// code point accumulator.
///
/// `width` is the total length of the sequence in bytes; the lead byte
/// carries 5 payload bits for width 2, 4 bits for width 3, and 3 bits for
/// width 4, so the mask is `0x7F` shifted right by `width`.
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    u32::from(byte & payload_mask)
}
69
70/// Returns the value of `ch` updated with continuation byte `byte`.
71fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
72    (ch << 6) | u32::from(byte & CONT_MASK)
73}