granit_parser/input/
str.rs

1use crate::{
2    char_traits::{is_blank_or_breakz, is_breakz, is_flow},
3    input::{BorrowedInput, Input, SkipTabs},
4};
5use alloc::string::String;
6
/// A parser input backed by a `&str`.
///
/// The input never copies the text: it keeps the complete string plus a shrinking view of the
/// part that has not been consumed yet. Cloning is cheap (two string slices and a counter) and
/// yields an independent cursor over the same text.
// The type name deliberately repeats the name of the module it lives in.
#[allow(clippy::module_name_repetitions)]
#[derive(Debug, Clone)]
pub struct StrInput<'a> {
    /// The full, original input string.
    ///
    /// This is kept to support O(1) byte-offset capture and zero-copy slicing via the optional
    /// [`Input::byte_offset`] / [`Input::slice_bytes`] APIs.
    original: &'a str,
    /// The remaining input slice.
    ///
    /// This is a moving window into [`Self::original`]. All consuming operations advance this
    /// slice, so it is always a suffix of [`Self::original`].
    buffer: &'a str,
    /// The number of characters we have looked ahead.
    ///
    /// This tracks how many characters the parser asked us to look ahead for so we can return the
    /// correct value in [`Self::buflen`].
    lookahead: usize,
}
26
27impl<'a> StrInput<'a> {
28    /// Create a new [`StrInput`] over the given string slice.
29    #[must_use]
30    pub fn new(input: &'a str) -> Self {
31        Self {
32            original: input,
33            buffer: input,
34            lookahead: 0,
35        }
36    }
37
38    /// Return the number of bytes consumed from the original input.
39    ///
40    /// This is an O(1) operation derived from the invariant that [`Self::buffer`] is always a
41    /// suffix of [`Self::original`].
42    #[inline]
43    #[must_use]
44    fn consumed_bytes(&self) -> usize {
45        self.original.len() - self.buffer.len()
46    }
47}
48
49impl Input for StrInput<'_> {
50    #[inline]
51    fn lookahead(&mut self, x: usize) {
52        // We already have all characters that we need.
53        // We cannot add '\0's to the buffer when we reach EOF.
54        // Character-retrieving functions return '\0' when they read past EOF.
55        self.lookahead = self.lookahead.max(x);
56    }
57
58    #[inline]
59    fn buflen(&self) -> usize {
60        self.lookahead
61    }
62
63    #[inline]
64    fn bufmaxlen(&self) -> usize {
65        BUFFER_LEN
66    }
67
68    fn buf_is_empty(&self) -> bool {
69        self.buflen() == 0
70    }
71
72    #[inline]
73    fn raw_read_ch(&mut self) -> char {
74        let mut chars = self.buffer.chars();
75        if let Some(c) = chars.next() {
76            self.buffer = chars.as_str();
77            c
78        } else {
79            '\0'
80        }
81    }
82
83    #[inline]
84    fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
85        if let Some((c, sub_str)) = split_first_char(self.buffer) {
86            if is_breakz(c) {
87                None
88            } else {
89                self.buffer = sub_str;
90                Some(c)
91            }
92        } else {
93            None
94        }
95    }
96
97    #[inline]
98    fn skip(&mut self) {
99        if !self.buffer.is_empty() {
100            let b = self.buffer.as_bytes()[0];
101            if b < 0x80 {
102                self.buffer = &self.buffer[1..];
103            } else {
104                let mut chars = self.buffer.chars();
105                chars.next();
106                self.buffer = chars.as_str();
107            }
108        }
109    }
110
111    #[inline]
112    fn skip_n(&mut self, count: usize) {
113        let mut chars = self.buffer.chars();
114        for _ in 0..count {
115            if chars.next().is_none() {
116                break;
117            }
118        }
119        self.buffer = chars.as_str();
120    }
121
122    #[inline]
123    fn peek(&self) -> char {
124        if self.buffer.is_empty() {
125            return '\0';
126        }
127        let b = self.buffer.as_bytes()[0];
128        if b < 0x80 {
129            b as char
130        } else {
131            self.buffer.chars().next().unwrap()
132        }
133    }
134
135    #[inline]
136    fn peek_nth(&self, n: usize) -> char {
137        if n == 0 {
138            return self.peek();
139        }
140        let bytes = self.buffer.as_bytes();
141        if n == 1 && bytes.len() >= 2 && bytes[0] < 0x80 && bytes[1] < 0x80 {
142            return bytes[1] as char;
143        }
144        let mut chars = self.buffer.chars();
145        for _ in 0..n {
146            if chars.next().is_none() {
147                return '\0';
148            }
149        }
150        chars.next().unwrap_or('\0')
151    }
152
153    #[inline]
154    fn byte_offset(&self) -> Option<usize> {
155        Some(self.consumed_bytes())
156    }
157
158    #[inline]
159    fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
160        debug_assert!(start <= end);
161        debug_assert!(end <= self.original.len());
162        self.original.get(start..end)
163    }
164
165    #[inline]
166    fn look_ch(&mut self) -> char {
167        self.lookahead(1);
168        self.peek()
169    }
170
171    #[inline]
172    fn next_char_is(&self, c: char) -> bool {
173        self.peek() == c
174    }
175
176    #[inline]
177    fn nth_char_is(&self, n: usize, c: char) -> bool {
178        self.peek_nth(n) == c
179    }
180
181    #[inline]
182    fn next_2_are(&self, c1: char, c2: char) -> bool {
183        let mut chars = self.buffer.chars();
184        chars.next() == Some(c1) && chars.next() == Some(c2)
185    }
186
187    #[inline]
188    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
189        let mut chars = self.buffer.chars();
190        chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3)
191    }
192
193    #[inline]
194    fn next_is_document_indicator(&self) -> bool {
195        if self.buffer.len() < 3 {
196            false
197        } else {
198            // Since all characters we look for are ASCII, we can directly use the byte API of str.
199            let bytes = self.buffer.as_bytes();
200            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
201                && (bytes[0] == b'.' || bytes[0] == b'-')
202                && bytes[0] == bytes[1]
203                && bytes[1] == bytes[2]
204        }
205    }
206
207    #[inline]
208    fn next_is_document_start(&self) -> bool {
209        if self.buffer.len() < 3 {
210            false
211        } else {
212            // Since all characters we look for are ASCII, we can directly use the byte API of str.
213            let bytes = self.buffer.as_bytes();
214            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
215                && bytes[0] == b'-'
216                && bytes[1] == b'-'
217                && bytes[2] == b'-'
218        }
219    }
220
221    #[inline]
222    fn next_is_document_end(&self) -> bool {
223        if self.buffer.len() < 3 {
224            false
225        } else {
226            // Since all characters we look for are ASCII, we can directly use the byte API of str.
227            let bytes = self.buffer.as_bytes();
228            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
229                && bytes[0] == b'.'
230                && bytes[1] == b'.'
231                && bytes[2] == b'.'
232        }
233    }
234
235    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
236        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
237
238        let mut new_str = self.buffer;
239        let mut has_yaml_ws = false;
240        let mut encountered_tab = false;
241
242        // Separate loops keep the fast space-only path while still tracking whether tabs were seen.
243        if skip_tabs == SkipTabs::Yes {
244            loop {
245                if let Some(sub_str) = new_str.strip_prefix(' ') {
246                    has_yaml_ws = true;
247                    new_str = sub_str;
248                } else if let Some(sub_str) = new_str.strip_prefix('\t') {
249                    encountered_tab = true;
250                    new_str = sub_str;
251                } else {
252                    break;
253                }
254            }
255        } else {
256            while let Some(sub_str) = new_str.strip_prefix(' ') {
257                has_yaml_ws = true;
258                new_str = sub_str;
259            }
260        }
261
262        // All characters consumed were ASCII. We can use the byte length difference to count the
263        // number of whitespace ignored.
264        let mut chars_consumed = self.buffer.len() - new_str.len();
265
266        if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
267            if !encountered_tab && !has_yaml_ws {
268                return (
269                    chars_consumed,
270                    Err("comments must be separated from other tokens by whitespace"),
271                );
272            }
273
274            // Skip remaining characters until we hit a breakz.
275            while let Some((c, sub_str)) = split_first_char(new_str) {
276                if is_breakz(c) {
277                    break;
278                }
279                new_str = sub_str;
280                chars_consumed += 1;
281            }
282        }
283
284        self.buffer = new_str;
285
286        (
287            chars_consumed,
288            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
289        )
290    }
291
292    #[allow(clippy::inline_always)]
293    #[inline(always)]
294    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
295        let nc = self.peek_nth(1);
296        match self.peek() {
297            // indicators can end a plain scalar, see 7.3.3. Plain Style
298            ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
299            c if in_flow && is_flow(c) => false,
300            _ => true,
301        }
302    }
303
304    #[inline]
305    fn next_is_blank_or_break(&self) -> bool {
306        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b' ' | b'\t' | b'\n' | b'\r')
307    }
308
309    #[inline]
310    fn next_is_blank_or_breakz(&self) -> bool {
311        self.buffer.is_empty()
312            || matches!(self.buffer.as_bytes()[0], b' ' | b'\t' | 0 | b'\n' | b'\r')
313    }
314
315    #[inline]
316    fn next_is_blank(&self) -> bool {
317        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b' ' | b'\t')
318    }
319
320    #[inline]
321    fn next_is_break(&self) -> bool {
322        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b'\n' | b'\r')
323    }
324
325    #[inline]
326    fn next_is_breakz(&self) -> bool {
327        self.buffer.is_empty() || matches!(self.buffer.as_bytes()[0], 0 | b'\n' | b'\r')
328    }
329
330    #[inline]
331    fn next_is_z(&self) -> bool {
332        self.buffer.is_empty() || self.buffer.as_bytes()[0] == 0
333    }
334
335    #[inline]
336    fn next_is_flow(&self) -> bool {
337        !self.buffer.is_empty()
338            && matches!(self.buffer.as_bytes()[0], b',' | b'[' | b']' | b'{' | b'}')
339    }
340
341    #[inline]
342    fn next_is_digit(&self) -> bool {
343        !self.buffer.is_empty() && self.buffer.as_bytes()[0].is_ascii_digit()
344    }
345
346    /// Check if the next character is an ASCII alphanumeric, `_`, or `-`.
347    ///
348    /// This is used as a heuristic for error detection (e.g., when `:` is followed
349    /// by tab and then a potential value character). The ASCII-only check is intentional:
350    /// it catches common cases like `key:\tvalue` while avoiding false positives for
351    /// valid YAML constructs. Unicode value starters (e.g., `äöü`) are not detected,
352    /// but such cases will still fail to parse (with a less specific error message).
353    #[inline]
354    fn next_is_alpha(&self) -> bool {
355        !self.buffer.is_empty()
356            && matches!(self.buffer.as_bytes()[0], b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-')
357    }
358
359    fn skip_while_non_breakz(&mut self) -> usize {
360        let mut byte_pos = 0;
361        let mut chars_consumed = 0;
362
363        for (i, c) in self.buffer.char_indices() {
364            if is_breakz(c) {
365                break;
366            }
367            byte_pos = i + c.len_utf8();
368            chars_consumed += 1;
369        }
370
371        self.buffer = &self.buffer[byte_pos..];
372        chars_consumed
373    }
374
375    #[inline]
376    fn skip_while_blank(&mut self) -> usize {
377        let bytes = self.buffer.as_bytes();
378
379        let mut i = 0;
380        while i < bytes.len() {
381            match bytes[i] {
382                b' ' | b'\t' => i += 1,
383                _ => break,
384            }
385        }
386
387        self.buffer = &self.buffer[i..];
388        i
389    }
390
391    /// Fetch characters matching `is_alpha` (ASCII alphanumeric, `_`, `-`).
392    ///
393    /// This is used for scanning tag handles (e.g., `!foo!`). Per YAML 1.2 spec,
394    /// tag handles use `ns-word-char` which is `[0-9a-zA-Z-]`. Our implementation
395    /// is slightly more permissive by also accepting `_`, but this is harmless
396    /// and matches common practice. Unicode characters like `ä` or `π` are NOT
397    /// valid in tag handles per spec, so the ASCII-only byte-based scanning here
398    /// is both correct and efficient.
399    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
400        let bytes = self.buffer.as_bytes();
401        let mut i = 0;
402
403        // All target characters are ASCII, so we can scan bytes directly.
404        while i < bytes.len() {
405            match bytes[i] {
406                b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-' => i += 1,
407                _ => break,
408            }
409        }
410
411        // All matched characters are ASCII, so we can safely slice and convert.
412        out.push_str(&self.buffer[..i]);
413        self.buffer = &self.buffer[i..];
414
415        i
416    }
417
418    fn fetch_while_is_yaml_non_space(&mut self, out: &mut String) -> usize {
419        let mut byte_pos = 0;
420        let mut chars_consumed = 0;
421
422        for (i, c) in self.buffer.char_indices() {
423            if !crate::char_traits::is_yaml_non_space(c) || crate::char_traits::is_z(c) {
424                break;
425            }
426
427            byte_pos = i + c.len_utf8();
428            chars_consumed += 1;
429        }
430
431        out.push_str(&self.buffer[..byte_pos]);
432        self.buffer = &self.buffer[byte_pos..];
433
434        chars_consumed
435    }
436
437    fn fetch_plain_scalar_chunk(
438        &mut self,
439        out: &mut String,
440        _count: usize,
441        flow_level_gt_0: bool,
442    ) -> (bool, usize) {
443        let bytes = self.buffer.as_bytes();
444        let len = bytes.len();
445        let mut byte_pos = 0;
446        let mut chars_consumed = 0;
447
448        while byte_pos < len {
449            let b = bytes[byte_pos];
450            if b < 0x80 {
451                let c = b as char;
452                if crate::char_traits::is_blank_or_breakz(c) {
453                    out.push_str(&self.buffer[..byte_pos]);
454                    self.buffer = &self.buffer[byte_pos..];
455                    return (true, chars_consumed);
456                }
457                if flow_level_gt_0 && crate::char_traits::is_flow(c) {
458                    out.push_str(&self.buffer[..byte_pos]);
459                    self.buffer = &self.buffer[byte_pos..];
460                    return (true, chars_consumed);
461                }
462                if c == ':' {
463                    let next_byte = if byte_pos + 1 < len {
464                        bytes[byte_pos + 1]
465                    } else {
466                        0
467                    };
468                    // ASCII optimization: if next_byte >= 0x80, it is not blank/breakz/flow
469                    let is_stop = if next_byte < 0x80 {
470                        let nc = next_byte as char;
471                        crate::char_traits::is_blank_or_breakz(nc)
472                            || (flow_level_gt_0 && crate::char_traits::is_flow(nc))
473                    } else {
474                        false
475                    };
476
477                    if is_stop {
478                        out.push_str(&self.buffer[..byte_pos]);
479                        self.buffer = &self.buffer[byte_pos..];
480                        return (true, chars_consumed);
481                    }
482                }
483                byte_pos += 1;
484                chars_consumed += 1;
485            } else {
486                let mut chars = self.buffer[byte_pos..].chars();
487                let c = chars.next().unwrap();
488                byte_pos += c.len_utf8();
489                chars_consumed += 1;
490            }
491        }
492
493        out.push_str(&self.buffer[..byte_pos]);
494        self.buffer = &self.buffer[byte_pos..];
495        // If we reached here, we consumed the whole string (EOF).
496        // EOF is effectively a stop condition (breakz).
497        (true, chars_consumed)
498    }
499}
500
501impl<'a> BorrowedInput<'a> for StrInput<'a> {
502    #[inline]
503    fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'a str> {
504        debug_assert!(start <= end);
505        debug_assert!(end <= self.original.len());
506        self.original.get(start..end)
507    }
508}
509
/// The buffer size reported to the scanner.
///
/// No buffer of this size is ever allocated. The scanner may ask for any character of the
/// virtual buffer: positions inside the input yield the actual character, positions past EOF
/// yield `\0`.
///
/// How many characters [`lookahead`] is asked for depends on the buffer size of the input. Ours
/// is virtually unlimited, which the scanner cannot work with: it may allocate buffers of its own
/// sized after [`bufmaxlen`], so returning [`usize::MAX`] is not an option. Returning the number
/// of characters left is not always possible either, because the scanner expects [`buflen`] to
/// report the value that was just passed to [`lookahead`].
///
/// As a result, [`bufmaxlen`] influences the value [`lookahead`] is called with, which in turn
/// dictates what [`buflen`] returns. To keep all three consistent, [`bufmaxlen`] returns this
/// constant. Since the input is processed one line at a time, the value is meant as a balance
/// between memory consumption and the maximum line length we expect.
///
/// [`lookahead`]: `StrInput::lookahead`
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
/// [`buflen`]: `StrInput::buflen`
const BUFFER_LEN: usize = 128;
533
/// Separates the first character of `s` from the remainder of the string.
///
/// Returns `None` when `s` is empty, otherwise the first character together with everything
/// that follows it.
#[inline]
fn split_first_char(s: &str) -> Option<(char, &str)> {
    let first = s.chars().next()?;
    // The encoded width of the first character is exactly where the remainder starts.
    let rest = &s[first.len_utf8()..];
    Some((first, rest))
}
542
#[cfg(test)]
mod test {
    use alloc::string::String;

    use crate::input::{BorrowedInput, Input, SkipTabs};

    use super::StrInput;

    /// `---` is a document start (and a document indicator) at EOF and before blanks/breaks;
    /// `...` is an indicator but not a start.
    #[test]
    pub fn is_document_start() {
        let cases = [("---\n", true), ("---", true), ("...\n", false), ("--- ", true)];

        for &(text, expect_start) in &cases {
            let input = StrInput::new(text);
            assert_eq!(input.next_is_document_start(), expect_start);
            assert!(input.next_is_document_indicator());
        }
    }

    /// `...` is a document end (and a document indicator) at EOF and before blanks/breaks;
    /// `---` is an indicator but not an end.
    #[test]
    pub fn is_document_end() {
        let cases = [("...\n", true), ("...", true), ("---\n", false), ("... ", true)];

        for &(text, expect_end) in &cases {
            let input = StrInput::new(text);
            assert_eq!(input.next_is_document_end(), expect_end);
            assert!(input.next_is_document_indicator());
        }
    }

    /// Raw reads advance the byte offset by the encoded width and stop moving at EOF.
    #[test]
    fn raw_reads_track_byte_offsets_and_eof() {
        let mut input = StrInput::new("aé");
        let steps = [('a', 1), ('é', 3), ('\0', 3)];

        for &(expected_char, expected_offset) in &steps {
            assert_eq!(input.raw_read_ch(), expected_char);
            assert_eq!(input.byte_offset(), Some(expected_offset));
        }
    }

    /// A breakz character is reported as `None` and left in the input.
    #[test]
    fn raw_read_non_breakz_stops_before_breakz() {
        let mut line = StrInput::new("a\n");

        assert_eq!(line.raw_read_non_breakz_ch(), Some('a'));
        assert_eq!(line.raw_read_non_breakz_ch(), None);
        assert_eq!(line.peek(), '\n');

        let mut nothing = StrInput::new("");
        assert_eq!(nothing.raw_read_non_breakz_ch(), None);
    }

    /// Skipping works across multi-byte characters and is a no-op once EOF is reached.
    #[test]
    fn skip_handles_ascii_unicode_and_eof() {
        let mut cursor = StrInput::new("éab");

        cursor.skip();
        assert_eq!(cursor.peek(), 'a');

        cursor.skip_n(8);
        assert_eq!(cursor.peek(), '\0');

        cursor.skip();
        assert_eq!(cursor.peek(), '\0');
    }

    /// Peeking beyond the input yields NUL; multi-character comparisons decode Unicode.
    #[test]
    fn peeking_past_end_returns_nul() {
        let two_ascii = StrInput::new("ab");
        assert_eq!(two_ascii.peek_nth(1), 'b');
        assert_eq!(two_ascii.peek_nth(3), '\0');

        let with_unicode = StrInput::new("éab");
        assert!(with_unicode.next_3_are('é', 'a', 'b'));
        assert!(!with_unicode.next_3_are('é', 'a', 'c'));
    }

    /// With `SkipTabs::No`, only the spaces are consumed and the tab is left in place.
    #[test]
    fn skip_ws_to_eol_without_tabs_stops_before_tab() {
        let mut cursor = StrInput::new("  \t# comment\n");

        let (count, outcome) = cursor.skip_ws_to_eol(SkipTabs::No);

        assert_eq!(count, 2);
        let flags = outcome.unwrap();
        assert!(!flags.found_tabs());
        assert!(flags.has_valid_yaml_ws());
        assert_eq!(cursor.peek(), '\t');
    }

    /// A comment preceded by whitespace is consumed up to the line break.
    #[test]
    fn skip_ws_to_eol_skips_comments_after_whitespace() {
        let mut cursor = StrInput::new("  # comment\nnext");

        let (count, outcome) = cursor.skip_ws_to_eol(SkipTabs::Yes);

        assert_eq!(count, 11);
        let flags = outcome.unwrap();
        assert!(!flags.found_tabs());
        assert!(flags.has_valid_yaml_ws());
        assert_eq!(cursor.peek(), '\n');
    }

    /// A comment without preceding whitespace is an error and consumes nothing.
    #[test]
    fn skip_ws_to_eol_rejects_unseparated_comment() {
        let mut cursor = StrInput::new("# comment\n");

        let (count, outcome) = cursor.skip_ws_to_eol(SkipTabs::Yes);

        assert_eq!(count, 0);
        assert_eq!(
            outcome.err(),
            Some("comments must be separated from other tokens by whitespace")
        );
        assert_eq!(cursor.peek(), '#');
    }

    /// Only ASCII word characters, `_` and `-` are fetched; the scan stops at `é`.
    #[test]
    fn fetch_while_is_alpha_is_ascii_only() {
        let mut cursor = StrInput::new("abc_123-é");
        let mut collected = String::new();

        assert_eq!(cursor.fetch_while_is_alpha(&mut collected), 8);
        assert_eq!(collected, "abc_123-");
        assert_eq!(cursor.peek(), 'é');
    }

    /// A `:` followed by a non-ASCII character does not end the plain scalar.
    #[test]
    fn fetch_plain_scalar_chunk_handles_non_ascii_after_colon() {
        let mut cursor = StrInput::new("a:é ");
        let mut collected = String::new();

        let fetched = cursor.fetch_plain_scalar_chunk(&mut collected, 16, false);

        assert_eq!(fetched, (true, 3));
        assert_eq!(collected, "a:é");
        assert_eq!(cursor.peek(), ' ');
    }

    /// Inside a flow collection, a flow indicator ends the plain scalar.
    #[test]
    fn fetch_plain_scalar_chunk_stops_at_flow_indicator() {
        let mut cursor = StrInput::new("abc,def");
        let mut collected = String::new();

        let fetched = cursor.fetch_plain_scalar_chunk(&mut collected, 16, true);

        assert_eq!(fetched, (true, 3));
        assert_eq!(collected, "abc");
        assert_eq!(cursor.peek(), ',');
    }

    /// Both slicing APIs address the original input by byte offsets.
    #[test]
    fn borrowed_slices_use_original_input_lifetime() {
        let source = StrInput::new("aéz");

        assert_eq!(BorrowedInput::slice_borrowed(&source, 1, 3), Some("é"));
        assert_eq!(source.slice_bytes(3, 4), Some("z"));
    }
}
granit_parser/input/str.rs

granit_parser/input/
str.rs