hocon_rs/parser/
read.rs

1use std::str;
2
3use derive_more::{Deref, DerefMut};
4
5use crate::Result;
6use crate::error::Error;
7
/// Largest look-ahead window a single `peek_n` call may request.
///
/// The `include` token is 7 bytes long, so the window has to be at least that wide.
pub(crate) const MAX_PEEK_N: usize = "include".len();

/// Size in bytes of the fixed buffer held by `StreamRead`.
pub(crate) const DEFAULT_BUFFER_SIZE: usize = 512;
12
/// Measures the HOCON whitespace character, if any, at the very start of `bytes`.
///
/// Returns the UTF-8 encoded length (1, 2 or 3) of that character, or `0` when the
/// input is empty, is cut off in the middle of a candidate sequence, or does not
/// begin with one of the characters below. Only the first character is examined.
///
/// Recognized as whitespace:
/// - U+0009..=U+000D (tab, line feed, vertical tab, form feed, carriage return)
/// - U+001C..=U+001F (file, group, record and unit separators)
/// - U+0020 (space)
/// - U+0085 (NEL) and U+00A0 (NO-BREAK SPACE)
/// - U+1680 (OGHAM SPACE MARK)
/// - U+2000..=U+200A (EN QUAD through HAIR SPACE, which covers U+2007 FIGURE SPACE)
/// - U+2028 and U+2029 (line and paragraph separators)
/// - U+202F (NARROW NO-BREAK SPACE)
/// - U+205F (MEDIUM MATHEMATICAL SPACE)
/// - U+3000 (IDEOGRAPHIC SPACE)
/// - U+FEFF (byte order mark, treated as whitespace)
#[inline]
pub fn leading_whitespace_bytes(bytes: &[u8]) -> usize {
    // Dispatch on the lead byte first, then on the continuation bytes that follow it.
    let Some((&lead, tail)) = bytes.split_first() else {
        return 0;
    };
    match (lead, tail) {
        // Single-byte whitespace: 0x09..=0x0D, 0x1C..=0x1F and the plain space.
        (b'\t'..=b'\r' | 0x1C..=0x1F | b' ', _) => 1,

        // U+0085 and U+00A0 share the 0xC2 lead byte.
        (0xC2, [0x85 | 0xA0, ..]) => 2,

        // U+1680
        (0xE1, [0x9A, 0x80, ..]) => 3,

        // U+2000..=U+200A, U+2028, U+2029 and U+202F all encode as E2 80 xx.
        (0xE2, [0x80, 0x80..=0x8A | 0xA8 | 0xA9 | 0xAF, ..]) => 3,

        // U+205F
        (0xE2, [0x81, 0x9F, ..]) => 3,

        // U+3000
        (0xE3, [0x80, 0x80, ..]) => 3,

        // U+FEFF
        (0xEF, [0xBB, 0xBF, ..]) => 3,

        _ => 0,
    }
}
66
67fn parse_escaped_char<'de, R: Read<'de>>(reader: &mut R, scratch: &mut Vec<u8>) -> Result<()> {
68    let ch = reader.next()?;
69    match ch {
70        b'"' => scratch.push(b'"'),
71        b'\\' => scratch.push(b'\\'),
72        b'/' => scratch.push(b'/'),
73        b'b' => scratch.push(b'\x08'),
74        b'f' => scratch.push(b'\x0c'),
75        b'n' => scratch.push(b'\n'),
76        b'r' => scratch.push(b'\r'),
77        b't' => scratch.push(b'\t'),
78        b'u' => parse_escaped_unicode(reader, scratch)?,
79        _ => return Err(Error::InvalidEscape),
80    }
81    Ok(())
82}
83
84/// Parses a Unicode escape sequence of the form `\uXXXX` (and possibly a surrogate pair).
85///
86/// This function reads exactly 4 hexadecimal digits after `\u` and converts them into
87/// a Unicode code point. If the code point is in the high-surrogate range (`0xD800..=0xDBFF`),
88/// it expects another `\uXXXX` low surrogate (`0xDC00..=0xDFFF`) to follow and combines
89/// them into a single supplementary character.
90///
91/// The resulting Unicode scalar value is then encoded as UTF-8 and appended to `scratch`.
92///
93/// # Arguments
94/// * `reader`  - The input reader providing bytes (typically HOCON/JSON parser input).
95/// * `scratch` - A temporary buffer to which the decoded UTF-8 bytes are appended.
96///
97/// # Errors
98/// Returns `Error::InvalidEscape` if:
99/// - the escape sequence is malformed,
100/// - contains invalid hex digits,
101/// - contains an unpaired surrogate,
102/// - or produces an invalid Unicode code point.
103///
104/// # Safety
105/// This implementation uses `char::from_u32` + `encode_utf8` to guarantee that only valid
106/// Unicode scalar values are emitted, avoiding panics or undefined behavior.
107///
108/// # Example
109/// ```ignore
110/// // parsing "\u0041" should append 'A'
111/// let mut buf = Vec::new();
112/// let mut input = SliceReader::new(br"0041"); // hypothetical reader
113/// parse_escaped_unicode(&mut input, &mut buf).unwrap();
114/// assert_eq!(buf, b"A");
115/// ```
116fn parse_escaped_unicode<'de, R: Read<'de>>(reader: &mut R, scratch: &mut Vec<u8>) -> Result<()> {
117    /// Reads 4 hexadecimal digits (`\uXXXX`) and returns a `u16`.
118    fn parse_hex16<'de, R: Read<'de>>(reader: &mut R) -> Result<u16> {
119        let mut n: u16 = 0;
120        for _ in 0..4 {
121            let b = reader.next()?;
122            n = match b {
123                b'0'..=b'9' => (n << 4) | (b - b'0') as u16,
124                b'a'..=b'f' => (n << 4) | (10 + b - b'a') as u16,
125                b'A'..=b'F' => (n << 4) | (10 + b - b'A') as u16,
126                _ => return Err(Error::InvalidEscape),
127            };
128        }
129        Ok(n)
130    }
131
132    // Parse first 4 hex digits
133    let mut n = parse_hex16(reader)? as u32;
134
135    // Handle surrogate pair (UTF-16 encoding for supplementary characters)
136    if (0xD800..=0xDBFF).contains(&n) {
137        // Expect `\u` for the low surrogate
138        if reader.next()? != b'\\' || reader.next()? != b'u' {
139            return Err(Error::InvalidEscape);
140        }
141        let n2 = parse_hex16(reader)? as u32;
142        if !(0xDC00..=0xDFFF).contains(&n2) {
143            return Err(Error::InvalidEscape);
144        }
145        // Combine surrogate pair into a single code point
146        n = 0x10000 + (((n - 0xD800) << 10) | (n2 - 0xDC00));
147    }
148
149    // Convert to `char` and encode as UTF-8
150    if let Some(ch) = char::from_u32(n) {
151        let mut buf = [0u8; 4];
152        scratch.extend_from_slice(ch.encode_utf8(&mut buf).as_bytes());
153        Ok(())
154    } else {
155        Err(Error::InvalidEscape)
156    }
157}
158
/// A location in the input, as reported by `Read::position`.
///
/// How the two counters are based depends on the reader that produced the value.
#[derive(Clone, Copy, Debug)]
pub struct Position {
    /// Line counter.
    pub line: usize,
    /// Column counter within the line.
    pub column: usize,
}
164
/// A parsed value that is either a direct view into the input or a view into a
/// temporary buffer.
///
/// Both variants dereference to `T`; the distinction only records which lifetime
/// the data is tied to.
pub enum Reference<'b, 'c, T: ?Sized + 'static> {
    /// Data that lives as long as the original input (`'b`).
    Borrowed(&'b T),
    /// Data that was copied elsewhere (such as a scratch buffer) and lives for `'c`.
    Copied(&'c T),
}

impl<'b, 'c, T: ?Sized + 'static> std::ops::Deref for Reference<'b, 'c, T> {
    type Target = T;

    /// Yields the referenced value regardless of which variant holds it.
    fn deref(&self) -> &Self::Target {
        match self {
            Self::Borrowed(from_input) => *from_input,
            Self::Copied(from_buffer) => *from_buffer,
        }
    }
}
186
187pub trait Read<'de> {
188    fn position(&self) -> Position;
189
190    fn peek_n(&mut self, n: usize) -> Result<&[u8]>;
191
192    #[inline]
193    fn peek(&mut self) -> Result<u8> {
194        let chars = self.peek_n(1)?;
195        Ok(chars[0])
196    }
197
198    #[inline]
199    fn peek2(&mut self) -> Result<(u8, u8)> {
200        let chars = self.peek_n(2)?;
201        Ok((chars[0], chars[1]))
202    }
203
204    fn next(&mut self) -> Result<u8>;
205
206    #[inline]
207    fn discard(&mut self, n: usize) -> Result<()> {
208        for _ in 0..n {
209            self.next()?;
210        }
211        Ok(())
212    }
213
214    fn parse_str<'s, F>(
215        &'s mut self,
216        escape: bool,
217        scratch: &'s mut Vec<u8>,
218        delimiter: F,
219    ) -> Result<Reference<'de, 's, str>>
220    where
221        F: Fn(&mut Self) -> Result<bool>;
222
223    #[inline]
224    fn peek_whitespace(&mut self) -> Result<Option<usize>> {
225        let n = match self.peek_n(3) {
226            Ok(bytes) => leading_whitespace_bytes(bytes),
227            Err(Error::Eof) => match self.peek_n(2) {
228                Ok(bytes) => leading_whitespace_bytes(bytes),
229                Err(Error::Eof) => match self.peek_n(1) {
230                    Ok(bytes) => leading_whitespace_bytes(bytes),
231                    Err(err) => {
232                        return Err(err);
233                    }
234                },
235                Err(err) => return Err(err),
236            },
237            Err(err) => return Err(err),
238        };
239        if n > 0 { Ok(Some(n)) } else { Ok(None) }
240    }
241
242    #[inline]
243    fn starts_with_whitespace(&mut self) -> Result<bool> {
244        self.peek_whitespace().map(|n| n.is_some())
245    }
246
247    #[inline]
248    fn peek_horizontal_whitespace(&mut self) -> Result<Option<usize>> {
249        if self.peek()? != b'\n' {
250            self.peek_whitespace()
251        } else {
252            Ok(None)
253        }
254    }
255
256    #[inline]
257    fn starts_with_horizontal_whitespace(&mut self) -> Result<bool> {
258        self.peek_horizontal_whitespace().map(|n| n.is_some())
259    }
260}
261
262pub struct StreamRead<R: std::io::Read> {
263    inner: R,
264    buffer: [u8; DEFAULT_BUFFER_SIZE],
265    head: usize,
266    tail: usize,
267    eof: bool,
268    line: usize,
269    col: usize,
270}
271
272impl<R: std::io::Read> StreamRead<R> {
273    pub fn new(reader: R) -> Self {
274        StreamRead {
275            inner: reader,
276            buffer: [0u8; _],
277            head: 0,
278            tail: 0,
279            eof: false,
280            line: 0,
281            col: 0,
282        }
283    }
284
285    fn fill_buf(&mut self) -> Result<()> {
286        if self.eof {
287            return Err(Error::Eof);
288        }
289
290        // 如果 buffer 已经满了，就不能再读
291        if self.tail == self.buffer.len() {
292            return Ok(());
293        }
294
295        let empty_buf = &mut self.buffer[self.tail..];
296        let n = self.inner.read(empty_buf)?;
297        if n == 0 {
298            self.eof = true;
299        }
300        self.tail += n;
301        Ok(())
302    }
303
304    #[inline]
305    fn available_data_len(&self) -> usize {
306        self.tail - self.head
307    }
308}
309
310impl<'de, R: std::io::Read> Read<'de> for StreamRead<R> {
311    fn position(&self) -> Position {
312        Position {
313            line: self.line,
314            column: self.col,
315        }
316    }
317
318    #[inline]
319    fn peek_n(&mut self, n: usize) -> Result<&[u8]> {
320        debug_assert!(n > 0 && n <= MAX_PEEK_N);
321
322        if self.available_data_len() < n && !self.eof {
323            // 如果 buffer 已经写满但数据不够 -> 搬移一下
324            if self.tail == self.buffer.len() && self.head > 0 {
325                let len = self.tail - self.head;
326                self.buffer.copy_within(self.head..self.tail, 0);
327                self.head = 0;
328                self.tail = len;
329            }
330            self.fill_buf()?;
331        }
332        if self.available_data_len() < n {
333            Err(Error::Eof)
334        } else {
335            Ok(&self.buffer[self.head..self.head + n])
336        }
337    }
338
339    #[inline]
340    fn next(&mut self) -> Result<u8> {
341        if self.available_data_len() == 0 && !self.eof {
342            self.fill_buf()?;
343        }
344        let byte = self.buffer[self.head];
345        if byte == b'\n' {
346            self.line += 1;
347        } else {
348            self.col += 1;
349        }
350        self.head += 1;
351        if self.head == self.tail {
352            self.head = 0;
353            self.tail = 0;
354        }
355        Ok(byte)
356    }
357
358    #[inline]
359    fn parse_str<'s, F>(
360        &'s mut self,
361        escape: bool,
362        scratch: &'s mut Vec<u8>,
363        delimiter: F,
364    ) -> Result<Reference<'de, 's, str>>
365    where
366        F: Fn(&mut Self) -> Result<bool>,
367    {
368        loop {
369            if !delimiter(self)? {
370                match self.next()? {
371                    b'\\' if escape => {
372                        parse_escaped_char(self, scratch)?;
373                    }
374                    ch => {
375                        scratch.push(ch);
376                    }
377                }
378            } else {
379                break;
380            }
381        }
382        str::from_utf8(scratch)
383            .map_err(|_| Error::InvalidUtf8)
384            .map(Reference::Copied)
385    }
386}
387
/// Shared body of the `parse_str_bytes` methods of the slice-backed readers.
///
/// `$self` must expose `slice` and `index`. Input is scanned until `$delimiter`
/// reports `Ok(true)` or the slice ends. When `$escape` is true, each backslash
/// flushes the bytes seen so far into `$scratch` and hands over to
/// `parse_escaped_char`. If `$scratch` is still empty at the end, `$result` is
/// applied to a slice of the input and wrapped in `Reference::Borrowed`;
/// otherwise the remaining bytes are appended to `$scratch`, and `$result` is
/// applied to it and wrapped in `Reference::Copied`.
macro_rules! parse_str_bytes_impl {
    ($self:expr, $escape:expr, $scratch:expr, $delimiter:expr, $result:expr) => {{
        // Start of the run of input bytes that has not been copied to the scratch buffer.
        let mut pending_from = $self.index;
        while !$delimiter($self)? && $self.index != $self.slice.len() {
            if $escape && $self.slice[$self.index] == b'\\' {
                // Flush the literal run, skip the backslash, then decode the escape.
                $scratch.extend_from_slice(&$self.slice[pending_from..$self.index]);
                $self.index += 1;
                parse_escaped_char($self, $scratch)?;
                pending_from = $self.index;
            } else {
                $self.index += 1;
            }
        }
        if $scratch.is_empty() {
            // Nothing was copied, so the result can point straight into the input.
            $result(&$self.slice[pending_from..$self.index]).map(Reference::Borrowed)
        } else {
            $scratch.extend_from_slice(&$self.slice[pending_from..$self.index]);
            $result($scratch).map(Reference::Copied)
        }
    }};
}
420
/// Reader over an in-memory byte slice.
pub struct SliceRead<'de> {
    /// The complete input.
    slice: &'de [u8],
    /// Offset of the next unconsumed byte in `slice`.
    index: usize,
}
425
426impl<'de> SliceRead<'de> {
427    pub fn new(slice: &'de [u8]) -> Self {
428        SliceRead { slice, index: 0 }
429    }
430
431    fn position_of_index(&self, i: usize) -> Position {
432        let start_of_line = match memchr::memrchr(b'\n', &self.slice[..i]) {
433            Some(position) => position + 1,
434            None => 0,
435        };
436        Position {
437            line: 1 + memchr::memchr_iter(b'\n', &self.slice[..start_of_line]).count(),
438            column: i - start_of_line,
439        }
440    }
441
442    #[inline]
443    fn available_data_len(&self) -> usize {
444        self.slice.len() - self.index
445    }
446
447    pub(crate) fn rest(&self) -> &[u8] {
448        &self.slice[self.index..]
449    }
450
451    #[inline]
452    fn parse_str_bytes<'s, E, T, R>(
453        &'s mut self,
454        escape: bool,
455        scratch: &'s mut Vec<u8>,
456        delimiter: E,
457        result: R,
458    ) -> Result<Reference<'de, 's, T>>
459    where
460        T: ?Sized + 's,
461        E: Fn(&mut Self) -> Result<bool>,
462        R: for<'f> FnOnce(&'f [u8]) -> Result<&'f T>,
463    {
464        parse_str_bytes_impl!(self, escape, scratch, delimiter, result)
465    }
466}
467
468impl<'de> Read<'de> for SliceRead<'de> {
469    fn position(&self) -> Position {
470        self.position_of_index(self.index)
471    }
472
473    #[inline]
474    fn peek_n(&mut self, n: usize) -> Result<&[u8]> {
475        debug_assert!(n > 0 && n <= MAX_PEEK_N);
476        if self.available_data_len() < n {
477            Err(Error::Eof)
478        } else {
479            Ok(&self.slice[self.index..self.index + n])
480        }
481    }
482
483    #[inline]
484    fn next(&mut self) -> Result<u8> {
485        if self.index == self.slice.len() {
486            return Err(Error::Eof);
487        }
488        let byte = self.slice[self.index];
489        self.index += 1;
490        Ok(byte)
491    }
492
493    fn discard(&mut self, n: usize) -> Result<()> {
494        if self.available_data_len() < n {
495            Err(Error::Eof)
496        } else {
497            self.index += n;
498            Ok(())
499        }
500    }
501
502    #[inline]
503    fn parse_str<'s, F>(
504        &'s mut self,
505        escape: bool,
506        scratch: &'s mut Vec<u8>,
507        end: F,
508    ) -> Result<Reference<'de, 's, str>>
509    where
510        F: Fn(&mut Self) -> Result<bool>,
511    {
512        self.parse_str_bytes(escape, scratch, end, |bytes| {
513            str::from_utf8(bytes).map_err(|_| Error::InvalidUtf8)
514        })
515    }
516}
517
518#[derive(Deref, DerefMut)]
519pub struct StrRead<'de> {
520    delegate: SliceRead<'de>,
521}
522
523impl<'de> StrRead<'de> {
524    pub fn new(s: &'de str) -> Self {
525        Self {
526            delegate: SliceRead::new(s.as_bytes()),
527        }
528    }
529
530    pub fn rest(&self) -> Result<&str> {
531        str::from_utf8(self.delegate.rest()).map_err(|_| Error::InvalidUtf8)
532    }
533
534    #[inline]
535    fn parse_str_bytes<'s, E, T, R>(
536        &'s mut self,
537        no_escape: bool,
538        scratch: &'s mut Vec<u8>,
539        delimiter: E,
540        result: R,
541    ) -> Result<Reference<'de, 's, T>>
542    where
543        T: ?Sized + 's,
544        E: Fn(&mut Self) -> Result<bool>,
545        R: for<'f> FnOnce(&'f [u8]) -> Result<&'f T>,
546    {
547        parse_str_bytes_impl!(self, no_escape, scratch, delimiter, result)
548    }
549}
550
551impl<'de> Read<'de> for StrRead<'de> {
552    fn position(&self) -> Position {
553        self.delegate.position()
554    }
555
556    #[inline]
557    fn peek_n(&mut self, n: usize) -> Result<&[u8]> {
558        self.delegate.peek_n(n)
559    }
560
561    #[inline]
562    fn next(&mut self) -> Result<u8> {
563        self.delegate.next()
564    }
565
566    #[inline]
567    fn parse_str<'s, F>(
568        &'s mut self,
569        no_escape: bool,
570        scratch: &'s mut Vec<u8>,
571        end: F,
572    ) -> Result<Reference<'de, 's, str>>
573    where
574        F: Fn(&mut Self) -> Result<bool>,
575    {
576        self.parse_str_bytes(no_escape, scratch, end, |bytes| {
577            Ok(unsafe { str::from_utf8_unchecked(bytes) })
578        })
579    }
580}
581
#[cfg(test)]
mod tests {
    use super::{Read, StreamRead, leading_whitespace_bytes};
    use crate::Result;
    use rstest::rstest;

    /// Peeking must not consume input, and must follow the cursor after `discard`.
    #[test]
    fn test_stream_peek() -> Result<()> {
        let mut read = StreamRead::new("hello world".as_bytes());

        // Before anything is consumed, every window starts at 'h'.
        assert_eq!(read.peek()?, b'h');
        assert_eq!(read.peek2()?, (b'h', b'e'));
        assert_eq!(read.peek_n(3)?, b"hel");

        read.discard(3)?;

        // After skipping "hel" the windows start at the second 'l'.
        assert_eq!(read.peek()?, b'l');
        assert_eq!(read.peek2()?, (b'l', b'o'));
        assert_eq!(read.peek_n(3)?, b"lo ");
        Ok(())
    }

    /// Each case pairs an input with the expected byte length of its leading whitespace.
    #[rstest]
    // Empty input.
    #[case(&[] as &[u8], 0)]
    // Single-byte whitespace.
    #[case(b"\txyz", 1)]
    #[case(b"\nabc", 1)]
    #[case(&[0x0B, b'a', b'b'], 1)]
    #[case(&[0x0C, b'a', b'b'], 1)]
    #[case(b"\rHELLO", 1)]
    #[case(b" world", 1)]
    #[case(&[0x1C, b'X', b'Y'], 1)]
    #[case(&[0x1F, b'Z'], 1)]
    // Two-byte whitespace.
    #[case(&[0xC2, 0x85, b'a', b'b'], 2)]
    #[case(&[0xC2, 0xA0, b'X'], 2)]
    // Three-byte whitespace.
    #[case(&[0xE1, 0x9A, 0x80, b'!'], 3)]
    #[case(&[0xE2, 0x80, 0x80, b'a'], 3)]
    #[case(&[0xE2, 0x80, 0x87, b'b'], 3)]
    #[case(&[0xE2, 0x80, 0x8A, b'c'], 3)]
    #[case(&[0xE2, 0x80, 0xA8, b'x'], 3)]
    #[case(&[0xE2, 0x80, 0xA9, b'y'], 3)]
    #[case(&[0xE2, 0x80, 0xAF, b'Z'], 3)]
    #[case(&[0xE2, 0x81, 0x9F, b'M'], 3)]
    #[case(&[0xE3, 0x80, 0x80, b'A'], 3)]
    #[case(&[0xEF, 0xBB, 0xBF, b'h'], 3)]
    // Non-whitespace and truncated sequences.
    #[case(b"Hello", 0)]
    #[case(&[0xE6, 0x97, 0xA5, b'X'], 0)]
    #[case(&[0xC2], 0)]
    #[case(&[0xE2, 0x80], 0)]
    fn test_leading_whitespace_bytes(#[case] bytes: &[u8], #[case] expected: usize) {
        assert_eq!(leading_whitespace_bytes(bytes), expected);
    }
}
hocon_rs/parser/read.rs

hocon_rs/parser/
read.rs