Skip to main content

granit_parser/input/
str.rs

1use crate::{
2    char_traits::{is_blank_or_breakz, is_breakz, is_flow},
3    input::{BorrowedInput, Input, SkipTabs},
4};
5use alloc::string::String;
6
/// Parser input backed by a borrowed `&str`.
#[allow(clippy::module_name_repetitions)]
pub struct StrInput<'src> {
    /// The complete string this input was created from.
    ///
    /// It is retained so that the optional [`Input::byte_offset`] / [`Input::slice_bytes`] APIs
    /// can report byte offsets in O(1) and hand out zero-copy slices.
    original: &'src str,
    /// The part of [`Self::original`] that has not been consumed yet.
    ///
    /// Every consuming operation moves the start of this window forward.
    buffer: &'src str,
    /// The largest look-ahead (in characters) the parser has requested so far.
    ///
    /// Nothing is actually buffered for a `&str` source, but the requested amount has to be
    /// remembered so that [`Self::buflen`] can report it back.
    lookahead: usize,
}
26
27impl<'a> StrInput<'a> {
28    /// Create a new [`StrInput`] with the given str.
29    #[must_use]
30    pub fn new(input: &'a str) -> Self {
31        Self {
32            original: input,
33            buffer: input,
34            lookahead: 0,
35        }
36    }
37
38    /// Return the number of bytes consumed from the original input.
39    ///
40    /// This is an O(1) operation derived from the invariant that [`Self::buffer`] is always a
41    /// suffix of [`Self::original`].
42    #[inline]
43    #[must_use]
44    fn consumed_bytes(&self) -> usize {
45        self.original.len() - self.buffer.len()
46    }
47}
48
49impl Input for StrInput<'_> {
50    #[inline]
51    fn lookahead(&mut self, x: usize) {
52        // We already have all characters that we need.
53        // We cannot add '\0's to the buffer should we prematurely reach EOF.
54        // Returning '\0's befalls the character-retrieving functions.
55        self.lookahead = self.lookahead.max(x);
56    }
57
58    #[inline]
59    fn buflen(&self) -> usize {
60        self.lookahead
61    }
62
63    #[inline]
64    fn bufmaxlen(&self) -> usize {
65        BUFFER_LEN
66    }
67
68    fn buf_is_empty(&self) -> bool {
69        self.buflen() == 0
70    }
71
72    #[inline]
73    fn raw_read_ch(&mut self) -> char {
74        let mut chars = self.buffer.chars();
75        if let Some(c) = chars.next() {
76            self.buffer = chars.as_str();
77            c
78        } else {
79            '\0'
80        }
81    }
82
83    #[inline]
84    fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
85        if let Some((c, sub_str)) = split_first_char(self.buffer) {
86            if is_breakz(c) {
87                None
88            } else {
89                self.buffer = sub_str;
90                Some(c)
91            }
92        } else {
93            None
94        }
95    }
96
97    #[inline]
98    fn skip(&mut self) {
99        if !self.buffer.is_empty() {
100            let b = self.buffer.as_bytes()[0];
101            if b < 0x80 {
102                self.buffer = &self.buffer[1..];
103            } else {
104                let mut chars = self.buffer.chars();
105                chars.next();
106                self.buffer = chars.as_str();
107            }
108        }
109    }
110
111    #[inline]
112    fn skip_n(&mut self, count: usize) {
113        let mut chars = self.buffer.chars();
114        for _ in 0..count {
115            if chars.next().is_none() {
116                break;
117            }
118        }
119        self.buffer = chars.as_str();
120    }
121
122    #[inline]
123    fn peek(&self) -> char {
124        if self.buffer.is_empty() {
125            return '\0';
126        }
127        let b = self.buffer.as_bytes()[0];
128        if b < 0x80 {
129            b as char
130        } else {
131            self.buffer.chars().next().unwrap()
132        }
133    }
134
135    #[inline]
136    fn peek_nth(&self, n: usize) -> char {
137        if n == 0 {
138            return self.peek();
139        }
140        let bytes = self.buffer.as_bytes();
141        if n == 1 && bytes.len() >= 2 && bytes[0] < 0x80 && bytes[1] < 0x80 {
142            return bytes[1] as char;
143        }
144        let mut chars = self.buffer.chars();
145        for _ in 0..n {
146            if chars.next().is_none() {
147                return '\0';
148            }
149        }
150        chars.next().unwrap_or('\0')
151    }
152
153    #[inline]
154    fn byte_offset(&self) -> Option<usize> {
155        Some(self.consumed_bytes())
156    }
157
158    #[inline]
159    fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
160        debug_assert!(start <= end);
161        debug_assert!(end <= self.original.len());
162        self.original.get(start..end)
163    }
164
165    #[inline]
166    fn look_ch(&mut self) -> char {
167        self.lookahead(1);
168        self.peek()
169    }
170
171    #[inline]
172    fn next_char_is(&self, c: char) -> bool {
173        self.peek() == c
174    }
175
176    #[inline]
177    fn nth_char_is(&self, n: usize, c: char) -> bool {
178        self.peek_nth(n) == c
179    }
180
181    #[inline]
182    fn next_2_are(&self, c1: char, c2: char) -> bool {
183        let mut chars = self.buffer.chars();
184        chars.next() == Some(c1) && chars.next() == Some(c2)
185    }
186
187    #[inline]
188    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
189        let mut chars = self.buffer.chars();
190        chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3)
191    }
192
193    #[inline]
194    fn next_is_document_indicator(&self) -> bool {
195        if self.buffer.len() < 3 {
196            false
197        } else {
198            // Since all characters we look for are ascii, we can directly use the byte API of str.
199            let bytes = self.buffer.as_bytes();
200            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
201                && (bytes[0] == b'.' || bytes[0] == b'-')
202                && bytes[0] == bytes[1]
203                && bytes[1] == bytes[2]
204        }
205    }
206
207    #[inline]
208    fn next_is_document_start(&self) -> bool {
209        if self.buffer.len() < 3 {
210            false
211        } else {
212            // Since all characters we look for are ascii, we can directly use the byte API of str.
213            let bytes = self.buffer.as_bytes();
214            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
215                && bytes[0] == b'-'
216                && bytes[1] == b'-'
217                && bytes[2] == b'-'
218        }
219    }
220
221    #[inline]
222    fn next_is_document_end(&self) -> bool {
223        if self.buffer.len() < 3 {
224            false
225        } else {
226            // Since all characters we look for are ascii, we can directly use the byte API of str.
227            let bytes = self.buffer.as_bytes();
228            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
229                && bytes[0] == b'.'
230                && bytes[1] == b'.'
231                && bytes[2] == b'.'
232        }
233    }
234
235    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
236        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
237
238        let mut new_str = self.buffer;
239        let mut has_yaml_ws = false;
240        let mut encountered_tab = false;
241
242        // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
243        // while keeping track of whether we encountered spaces and/or tabs.
244        if skip_tabs == SkipTabs::Yes {
245            loop {
246                if let Some(sub_str) = new_str.strip_prefix(' ') {
247                    has_yaml_ws = true;
248                    new_str = sub_str;
249                } else if let Some(sub_str) = new_str.strip_prefix('\t') {
250                    encountered_tab = true;
251                    new_str = sub_str;
252                } else {
253                    break;
254                }
255            }
256        } else {
257            while let Some(sub_str) = new_str.strip_prefix(' ') {
258                has_yaml_ws = true;
259                new_str = sub_str;
260            }
261        }
262
263        // All characters consumed were ascii. We can use the byte length difference to count the
264        // number of whitespace ignored.
265        let mut chars_consumed = self.buffer.len() - new_str.len();
266
267        if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
268            if !encountered_tab && !has_yaml_ws {
269                return (
270                    chars_consumed,
271                    Err("comments must be separated from other tokens by whitespace"),
272                );
273            }
274
275            // Skip remaining characters until we hit a breakz.
276            while let Some((c, sub_str)) = split_first_char(new_str) {
277                if is_breakz(c) {
278                    break;
279                }
280                new_str = sub_str;
281                chars_consumed += 1;
282            }
283        }
284
285        self.buffer = new_str;
286
287        (
288            chars_consumed,
289            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
290        )
291    }
292
293    #[allow(clippy::inline_always)]
294    #[inline(always)]
295    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
296        let nc = self.peek_nth(1);
297        match self.peek() {
298            // indicators can end a plain scalar, see 7.3.3. Plain Style
299            ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
300            c if in_flow && is_flow(c) => false,
301            _ => true,
302        }
303    }
304
305    #[inline]
306    fn next_is_blank_or_break(&self) -> bool {
307        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b' ' | b'\t' | b'\n' | b'\r')
308    }
309
310    #[inline]
311    fn next_is_blank_or_breakz(&self) -> bool {
312        self.buffer.is_empty()
313            || matches!(self.buffer.as_bytes()[0], b' ' | b'\t' | 0 | b'\n' | b'\r')
314    }
315
316    #[inline]
317    fn next_is_blank(&self) -> bool {
318        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b' ' | b'\t')
319    }
320
321    #[inline]
322    fn next_is_break(&self) -> bool {
323        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b'\n' | b'\r')
324    }
325
326    #[inline]
327    fn next_is_breakz(&self) -> bool {
328        self.buffer.is_empty() || matches!(self.buffer.as_bytes()[0], 0 | b'\n' | b'\r')
329    }
330
331    #[inline]
332    fn next_is_z(&self) -> bool {
333        self.buffer.is_empty() || self.buffer.as_bytes()[0] == 0
334    }
335
336    #[inline]
337    fn next_is_flow(&self) -> bool {
338        !self.buffer.is_empty()
339            && matches!(self.buffer.as_bytes()[0], b',' | b'[' | b']' | b'{' | b'}')
340    }
341
342    #[inline]
343    fn next_is_digit(&self) -> bool {
344        !self.buffer.is_empty() && self.buffer.as_bytes()[0].is_ascii_digit()
345    }
346
347    /// Check if the next character is an ASCII alphanumeric, `_`, or `-`.
348    ///
349    /// This is used as a heuristic for error detection (e.g., when `:` is followed
350    /// by tab and then a potential value character). The ASCII-only check is intentional:
351    /// it catches common cases like `key:\tvalue` while avoiding false positives for
352    /// valid YAML constructs. Unicode value starters (e.g., `äöü`) are not detected,
353    /// but such cases will still fail to parse (with a less specific error message).
354    #[inline]
355    fn next_is_alpha(&self) -> bool {
356        !self.buffer.is_empty()
357            && matches!(self.buffer.as_bytes()[0], b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-')
358    }
359
360    fn skip_while_non_breakz(&mut self) -> usize {
361        let mut byte_pos = 0;
362        let mut chars_consumed = 0;
363
364        for (i, c) in self.buffer.char_indices() {
365            if is_breakz(c) {
366                break;
367            }
368            byte_pos = i + c.len_utf8();
369            chars_consumed += 1;
370        }
371
372        self.buffer = &self.buffer[byte_pos..];
373        chars_consumed
374    }
375
376    #[inline]
377    fn skip_while_blank(&mut self) -> usize {
378        let bytes = self.buffer.as_bytes();
379
380        let mut i = 0;
381        while i < bytes.len() {
382            match bytes[i] {
383                b' ' | b'\t' => i += 1,
384                _ => break,
385            }
386        }
387
388        self.buffer = &self.buffer[i..];
389        i
390    }
391
392    /// Fetch characters matching `is_alpha` (ASCII alphanumeric, `_`, `-`).
393    ///
394    /// This is used for scanning tag handles (e.g., `!foo!`). Per YAML 1.2 spec,
395    /// tag handles use `ns-word-char` which is `[0-9a-zA-Z-]`. Our implementation
396    /// is slightly more permissive by also accepting `_`, but this is harmless
397    /// and matches common practice. Unicode characters like `ä` or `π` are NOT
398    /// valid in tag handles per spec, so the ASCII-only byte-based scanning here
399    /// is both correct and efficient.
400    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
401        let bytes = self.buffer.as_bytes();
402        let mut i = 0;
403
404        // All target characters are ASCII, so we can scan bytes directly.
405        while i < bytes.len() {
406            match bytes[i] {
407                b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-' => i += 1,
408                _ => break,
409            }
410        }
411
412        // All matched characters are ASCII, so we can safely slice and convert.
413        out.push_str(&self.buffer[..i]);
414        self.buffer = &self.buffer[i..];
415
416        i
417    }
418
419    fn fetch_while_is_yaml_non_space(&mut self, out: &mut String) -> usize {
420        let mut byte_pos = 0;
421        let mut chars_consumed = 0;
422
423        for (i, c) in self.buffer.char_indices() {
424            if !crate::char_traits::is_yaml_non_space(c) || crate::char_traits::is_z(c) {
425                break;
426            }
427
428            byte_pos = i + c.len_utf8();
429            chars_consumed += 1;
430        }
431
432        out.push_str(&self.buffer[..byte_pos]);
433        self.buffer = &self.buffer[byte_pos..];
434
435        chars_consumed
436    }
437
438    fn fetch_plain_scalar_chunk(
439        &mut self,
440        out: &mut String,
441        _count: usize,
442        flow_level_gt_0: bool,
443    ) -> (bool, usize) {
444        let bytes = self.buffer.as_bytes();
445        let len = bytes.len();
446        let mut byte_pos = 0;
447        let mut chars_consumed = 0;
448
449        while byte_pos < len {
450            let b = bytes[byte_pos];
451            if b < 0x80 {
452                let c = b as char;
453                if crate::char_traits::is_blank_or_breakz(c) {
454                    out.push_str(&self.buffer[..byte_pos]);
455                    self.buffer = &self.buffer[byte_pos..];
456                    return (true, chars_consumed);
457                }
458                if flow_level_gt_0 && crate::char_traits::is_flow(c) {
459                    out.push_str(&self.buffer[..byte_pos]);
460                    self.buffer = &self.buffer[byte_pos..];
461                    return (true, chars_consumed);
462                }
463                if c == ':' {
464                    let next_byte = if byte_pos + 1 < len {
465                        bytes[byte_pos + 1]
466                    } else {
467                        0
468                    };
469                    // ASCII optimization: if next_byte >= 0x80, it is not blank/breakz/flow
470                    let is_stop = if next_byte < 0x80 {
471                        let nc = next_byte as char;
472                        crate::char_traits::is_blank_or_breakz(nc)
473                            || (flow_level_gt_0 && crate::char_traits::is_flow(nc))
474                    } else {
475                        false
476                    };
477
478                    if is_stop {
479                        out.push_str(&self.buffer[..byte_pos]);
480                        self.buffer = &self.buffer[byte_pos..];
481                        return (true, chars_consumed);
482                    }
483                }
484                byte_pos += 1;
485                chars_consumed += 1;
486            } else {
487                let mut chars = self.buffer[byte_pos..].chars();
488                let c = chars.next().unwrap();
489                byte_pos += c.len_utf8();
490                chars_consumed += 1;
491            }
492        }
493
494        out.push_str(&self.buffer[..byte_pos]);
495        self.buffer = &self.buffer[byte_pos..];
496        // If we reached here, we consumed the whole string (EOF).
497        // EOF is effectively a stop condition (breakz).
498        (true, chars_consumed)
499    }
500}
501
502impl<'a> BorrowedInput<'a> for StrInput<'a> {
503    #[inline]
504    fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'a str> {
505        debug_assert!(start <= end);
506        debug_assert!(end <= self.original.len());
507        self.original.get(start..end)
508    }
509}
510
/// The buffer size reported to the scanner.
///
/// No buffer of this size is ever allocated. In practice, the scanner can ask for any character
/// it wants: if that character lies within the input, it is returned, otherwise `\0` is
/// returned.
///
/// The number of characters requested through [`lookahead`] depends on the buffer size of the
/// input. The buffer here is virtually unlimited, but the scanner cannot work with that: it may
/// allocate buffers of its own whose size is the value returned by [`bufmaxlen`] (which rules out
/// returning [`usize::MAX`]). Returning the number of characters left is not always possible
/// either, because the scanner expects [`buflen`], called right after [`lookahead`], to return
/// the value that [`lookahead`] was given.
///
/// This creates a complex situation in which [`bufmaxlen`] influences the value [`lookahead`] is
/// called with, which in turn dictates what [`buflen`] returns. To avoid breaking any of these
/// functions, [`bufmaxlen`] returns this constant. Since the input is processed one line at a
/// time, it should be a good balance between memory consumption and the maximum line length we
/// expect.
///
/// [`lookahead`]: `StrInput::lookahead`
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
/// [`buflen`]: `StrInput::buflen`
const BUFFER_LEN: usize = 128;
534
/// Split `s` into its first character and the remainder of the string.
///
/// Returns `None` when `s` is empty.
#[inline]
fn split_first_char(s: &str) -> Option<(char, &str)> {
    s.chars()
        .next()
        .map(|first| (first, &s[first.len_utf8()..]))
}
543
#[cfg(test)]
mod test {
    use alloc::string::String;

    use crate::input::{BorrowedInput, Input, SkipTabs};

    use super::StrInput;

    /// `---` is a document start when followed by a break, a blank or the end of the input.
    #[test]
    pub fn is_document_start() {
        for &text in &["---\n", "---", "--- "] {
            let input = StrInput::new(text);
            assert!(input.next_is_document_start());
            assert!(input.next_is_document_indicator());
        }

        // A document end marker is an indicator, but not a document start.
        let end_marker = StrInput::new("...\n");
        assert!(!end_marker.next_is_document_start());
        assert!(end_marker.next_is_document_indicator());
    }

    /// `...` is a document end when followed by a break, a blank or the end of the input.
    #[test]
    pub fn is_document_end() {
        for &text in &["...\n", "...", "... "] {
            let input = StrInput::new(text);
            assert!(input.next_is_document_end());
            assert!(input.next_is_document_indicator());
        }

        // A document start marker is an indicator, but not a document end.
        let start_marker = StrInput::new("---\n");
        assert!(!start_marker.next_is_document_end());
        assert!(start_marker.next_is_document_indicator());
    }

    /// Raw reads advance the byte offset by the encoded width and yield NUL at EOF.
    #[test]
    fn raw_reads_track_byte_offsets_and_eof() {
        let mut input = StrInput::new("aé");

        for &(expected_char, expected_offset) in &[('a', 1), ('é', 3), ('\0', 3)] {
            assert_eq!(input.raw_read_ch(), expected_char);
            assert_eq!(input.byte_offset(), Some(expected_offset));
        }
    }

    /// A breakz character is reported as `None` and left in the input.
    #[test]
    fn raw_read_non_breakz_stops_before_breakz() {
        let mut with_break = StrInput::new("a\n");
        assert_eq!(with_break.raw_read_non_breakz_ch(), Some('a'));
        assert_eq!(with_break.raw_read_non_breakz_ch(), None);
        assert_eq!(with_break.peek(), '\n');

        let mut nothing = StrInput::new("");
        assert_eq!(nothing.raw_read_non_breakz_ch(), None);
    }

    /// Skipping works on multi-byte characters and is harmless at EOF.
    #[test]
    fn skip_handles_ascii_unicode_and_eof() {
        let mut input = StrInput::new("éab");

        // One multi-byte character is skipped as a whole.
        input.skip();
        assert_eq!(input.peek(), 'a');

        // Skipping more characters than remain stops at EOF.
        input.skip_n(8);
        assert_eq!(input.peek(), '\0');

        // Skipping at EOF is a no-op.
        input.skip();
        assert_eq!(input.peek(), '\0');
    }

    /// Peeking beyond the input yields NUL; multi-character comparisons decode UTF-8.
    #[test]
    fn peeking_past_end_returns_nul() {
        let two_ascii = StrInput::new("ab");
        assert_eq!(two_ascii.peek_nth(1), 'b');
        assert_eq!(two_ascii.peek_nth(3), '\0');

        let mixed = StrInput::new("éab");
        assert!(mixed.next_3_are('é', 'a', 'b'));
        assert!(!mixed.next_3_are('é', 'a', 'c'));
    }

    /// With `SkipTabs::No`, only the spaces are consumed and the tab stays in the input.
    #[test]
    fn skip_ws_to_eol_without_tabs_stops_before_tab() {
        let mut input = StrInput::new("  \t# comment\n");

        let (consumed, outcome) = input.skip_ws_to_eol(SkipTabs::No);
        let outcome = outcome.unwrap();

        assert_eq!(consumed, 2);
        assert!(!outcome.found_tabs());
        assert!(outcome.has_valid_yaml_ws());
        assert_eq!(input.peek(), '\t');
    }

    /// A comment preceded by whitespace is skipped up to the line break.
    #[test]
    fn skip_ws_to_eol_skips_comments_after_whitespace() {
        let mut input = StrInput::new("  # comment\nnext");

        let (consumed, outcome) = input.skip_ws_to_eol(SkipTabs::Yes);
        let outcome = outcome.unwrap();

        assert_eq!(consumed, 11);
        assert!(!outcome.found_tabs());
        assert!(outcome.has_valid_yaml_ws());
        assert_eq!(input.peek(), '\n');
    }

    /// Only ASCII word characters are fetched; a non-ASCII letter ends the run.
    #[test]
    fn fetch_while_is_alpha_is_ascii_only() {
        let mut input = StrInput::new("abc_123-é");
        let mut fetched = String::new();

        let count = input.fetch_while_is_alpha(&mut fetched);

        assert_eq!(count, 8);
        assert_eq!(fetched, "abc_123-");
        assert_eq!(input.peek(), 'é');
    }

    /// A `:` followed by a non-ASCII character does not end the plain scalar.
    #[test]
    fn fetch_plain_scalar_chunk_handles_non_ascii_after_colon() {
        let mut input = StrInput::new("a:é ");
        let mut fetched = String::new();

        let outcome = input.fetch_plain_scalar_chunk(&mut fetched, 16, false);

        assert_eq!(outcome, (true, 3));
        assert_eq!(fetched, "a:é");
        assert_eq!(input.peek(), ' ');
    }

    /// Inside a flow collection, a flow indicator ends the plain scalar.
    #[test]
    fn fetch_plain_scalar_chunk_stops_at_flow_indicator() {
        let mut input = StrInput::new("abc,def");
        let mut fetched = String::new();

        let outcome = input.fetch_plain_scalar_chunk(&mut fetched, 16, true);

        assert_eq!(outcome, (true, 3));
        assert_eq!(fetched, "abc");
        assert_eq!(input.peek(), ',');
    }

    /// Slices are addressed by byte offsets into the original input.
    #[test]
    fn borrowed_slices_use_original_input_lifetime() {
        let input = StrInput::new("aéz");

        let borrowed = BorrowedInput::slice_borrowed(&input, 1, 3);
        assert_eq!(borrowed, Some("é"));

        let sliced = input.slice_bytes(3, 4);
        assert_eq!(sliced, Some("z"));
    }
}