Skip to main content

obeli_sk_boa_parser/source/
utf16.rs

1use super::ReadChar;
2use std::io;
3
/// A cursor over a borrowed slice of UTF-16 code units.
///
/// The slice is never copied; the reader only tracks how far into it
/// reading has progressed.
#[derive(Debug)]
pub struct UTF16Input<'a> {
    /// The UTF-16 code units being read.
    input: &'a [u16],
    /// Position in `input` of the next code unit to be read.
    index: usize,
}
10
11impl<'a> UTF16Input<'a> {
12    /// Creates a new `UTF16Input` from a UTF-16 encoded slice e.g. <code>[&\[u16\]][slice]</code>.
13    ///
14    /// [slice]: std::slice
15    #[must_use]
16    pub const fn new(input: &'a [u16]) -> Self {
17        Self { input, index: 0 }
18    }
19
20    // use `#[cold]` to hint to branch predictor that surrogate pairs are rare
21    #[cold]
22    fn handle_surrogate_pair(&mut self, u1: u16) -> u32 {
23        let Some(u2) = self.input.get(self.index).copied() else {
24            return u1.into();
25        };
26
27        // If the code unit is not a low surrogate, it is not a surrogate pair.
28        if !is_low_surrogate(u2) {
29            return u1.into();
30        }
31
32        self.index += 1;
33
34        code_point_from_surrogates(u1, u2)
35    }
36}
37
38impl ReadChar for UTF16Input<'_> {
39    /// Retrieves the next unchecked char in u32 code point.
40    fn next_char(&mut self) -> io::Result<Option<u32>> {
41        let Some(u1) = self.input.get(self.index).copied() else {
42            return Ok(None);
43        };
44
45        self.index += 1;
46
47        // If the code unit is not a high surrogate, it is not the start of a surrogate pair.
48        if !is_high_surrogate(u1) {
49            return Ok(Some(u1.into()));
50        }
51
52        Ok(Some(self.handle_surrogate_pair(u1)))
53    }
54}
55
/// First code unit of the high-surrogate range (inclusive).
const SURROGATE_HIGH_START: u16 = 0xD800;
/// Last code unit of the high-surrogate range (inclusive).
const SURROGATE_HIGH_END: u16 = 0xDBFF;
/// First code unit of the low-surrogate range (inclusive).
const SURROGATE_LOW_START: u16 = 0xDC00;
/// Last code unit of the low-surrogate range (inclusive).
const SURROGATE_LOW_END: u16 = 0xDFFF;
60
61fn is_high_surrogate(b: u16) -> bool {
62    (SURROGATE_HIGH_START..=SURROGATE_HIGH_END).contains(&b)
63}
64
65fn is_low_surrogate(b: u16) -> bool {
66    (SURROGATE_LOW_START..=SURROGATE_LOW_END).contains(&b)
67}
68
/// Combines a surrogate pair into a single code point.
///
/// The low ten bits of `high` become the upper ten bits of the offset, the
/// low ten bits of `low` become the lower ten, and the result is shifted
/// above the 16-bit range by adding `0x1_0000`. Neither argument is checked
/// for actually being a surrogate; bits above the low ten are simply masked off.
fn code_point_from_surrogates(high: u16, low: u16) -> u32 {
    let upper_bits = u32::from(high & 0x3ff) << 10;
    let lower_bits = u32::from(low & 0x3ff);
    0x1_0000 + (upper_bits | lower_bits)
}