Skip to main content

granit_parser/
input.rs

1//! Utilities to create a source of input to the parser.
2//!
//! The parser fetches its characters through the [`Input`] trait. Before implementing it yourself,
//! check whether [`BufferedInput`] already covers your needs.
5
6use alloc::string::String;
7
8pub(crate) mod buffered;
9pub(crate) mod str;
10
11#[allow(clippy::module_name_repetitions)]
12pub use buffered::BufferedInput;
13
14/// A trait for inputs that can provide borrowed slices with a specific lifetime.
15///
16/// This trait enables zero-copy (`Cow::Borrowed`) token values for inputs that keep a stable
17/// backing string. The key difference from [`Input::slice_bytes`] is that this method returns
18/// a slice with the input's original lifetime `'a`, not tied to `&self`.
19///
20/// For inputs that support zero-copy (like [`str::StrInput`]), this returns `Some(&'a str)`.
21/// For streaming inputs that don't have stable backing storage, this returns `None`.
22pub trait BorrowedInput<'a>: Input {
23    /// Return a borrowed slice of the underlying source between two byte offsets.
24    ///
25    /// Unlike [`Input::slice_bytes`], this returns a slice with the input's lifetime `'a`,
26    /// allowing the slice to outlive the borrow of `&self`.
27    ///
28    /// `start` and `end` are byte offsets as returned by [`Input::byte_offset`]. The interval is
29    /// half-open: `[start, end)`.
30    ///
31    /// Returns `None` if the input does not support zero-copy slicing.
32    #[must_use]
33    fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'a str>;
34}
35
36pub use crate::char_traits::{
37    is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
38};
39
40/// Interface for a source of characters.
41///
42/// Hiding the input's implementation behind this trait allows input-specific optimizations, such
43/// as using `str` methods instead of manually transferring one `char` at a time to a buffer.
44/// Implementations with stable backing storage can also return borrowed `&str` slices and avoid
45/// allocating token values.
46pub trait Input {
47    /// A hint to the input source that we will need to read `count` characters.
48    ///
49    /// If the input is exhausted, `\0` can be used to pad the last characters and later returned.
50    /// The characters must not be consumed, but may be placed in an internal buffer.
51    ///
52    /// This method may be a no-op if buffering yields no performance improvement.
53    ///
54    /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer. The
55    /// parser tracks how many characters are loaded in the buffer and acts accordingly.
56    fn lookahead(&mut self, count: usize);
57
58    /// Return the number of buffered characters in `self`.
59    #[must_use]
60    fn buflen(&self) -> usize;
61
62    /// Return the maximum number of characters this input can buffer for lookahead.
63    #[must_use]
64    fn bufmaxlen(&self) -> usize;
65
66    /// Return whether the lookahead buffer is empty.
67    #[inline]
68    #[must_use]
69    fn buf_is_empty(&self) -> bool {
70        self.buflen() == 0
71    }
72
73    /// Read a character from the input stream and return it directly.
74    ///
75    /// The internal buffer (if any) is bypassed.
76    #[must_use]
77    fn raw_read_ch(&mut self) -> char;
78
79    /// Read a non-breakz character from the input stream and return it directly.
80    ///
81    /// The internal buffer (if any) is bypassed.
82    ///
83    /// If the next character is a breakz, it is either not consumed or placed into the buffer (if
84    /// any).
85    #[must_use]
86    fn raw_read_non_breakz_ch(&mut self) -> Option<char>;
87
88    /// Consume the next character.
89    fn skip(&mut self);
90
91    /// Consume the next `count` characters.
92    fn skip_n(&mut self, count: usize);
93
94    /// Return the next character, without consuming it.
95    ///
96    /// Users of the [`Input`] must make sure that the character has been loaded through a prior
97    /// call to [`Input::lookahead`]. Implementors of [`Input`] may assume that a valid call to
98    /// [`Input::lookahead`] has been made beforehand.
99    ///
100    /// # Return
101    /// If the input source is not exhausted, returns the next character to be fed into the
102    /// scanner. Otherwise, returns `\0`.
103    #[must_use]
104    fn peek(&self) -> char;
105
106    /// Return the `n`-th character in the buffer, without consuming it.
107    ///
108    /// This function assumes that the `n`-th character in the input has already been fetched through
109    /// [`Input::lookahead`].
110    #[must_use]
111    fn peek_nth(&self, n: usize) -> char;
112
113    /// Return the current byte offset in the underlying source, if available.
114    ///
115    /// This is an *optional* capability that enables zero-copy (`Cow::Borrowed`) token values
116    /// for inputs that keep a stable backing string (notably [`str::StrInput`]).
117    ///
118    /// The returned value (when `Some`) is the number of bytes that have been consumed so far,
119    /// i.e. an offset into the original source string.
120    ///
121    /// # Correctness contract
122    /// Implementations returning `Some(_)` must satisfy all of the following:
123    ///
124    /// - The offset is a valid UTF-8 boundary in the underlying source.
125    /// - The offset is monotonically non-decreasing as characters are consumed.
126    /// - The underlying source is stable for the duration of parsing (no reallocation/mutation)
127    ///   so that slices returned by [`Input::slice_bytes`] remain valid.
128    ///
129    /// Inputs that cannot provide stable slicing (e.g. stream/iterator inputs) must return
130    /// `None`.
131    #[inline]
132    #[must_use]
133    fn byte_offset(&self) -> Option<usize> {
134        None
135    }
136
137    /// Return a borrowed slice of the underlying source between two byte offsets.
138    ///
139    /// This is an *optional* capability used to produce `Cow::Borrowed` values without
140    /// allocating.
141    ///
142    /// `start` and `end` are byte offsets as returned by [`Input::byte_offset`]. The interval is
143    /// half-open: `[start, end)`.
144    ///
145    /// # Correctness contract
146    /// Implementations returning `Some(&str)` must ensure:
147    ///
148    /// - `start <= end`.
149    /// - Both offsets are valid UTF-8 boundaries.
150    /// - The returned `&str` is a view into the stable underlying source associated with this
151    ///   input.
152    ///
153    /// Implementations that return `None` from [`Input::byte_offset`] must also return `None`
154    /// here.
155    #[inline]
156    #[must_use]
157    fn slice_bytes(&self, _start: usize, _end: usize) -> Option<&str> {
158        None
159    }
160
161    /// Return whether this input may contain a `#` character.
162    ///
163    /// This is a conservative performance hint. Inputs that cannot answer cheaply should return
164    /// `true`, which keeps full comment handling enabled.
165    #[inline]
166    #[must_use]
167    fn may_contain_comments(&self) -> bool {
168        true
169    }
170
171    /// Look for the next character and return it.
172    ///
173    /// The character is not consumed.
174    /// Equivalent to calling [`Input::lookahead`] and [`Input::peek`].
175    #[inline]
176    #[must_use]
177    fn look_ch(&mut self) -> char {
178        self.lookahead(1);
179        self.peek()
180    }
181
182    /// Return whether the next character in the input source is equal to `c`.
183    ///
184    /// This function assumes that the next character in the input has already been fetched through
185    /// [`Input::lookahead`].
186    #[inline]
187    #[must_use]
188    fn next_char_is(&self, c: char) -> bool {
189        self.peek() == c
190    }
191
192    /// Return whether the `n`-th character in the input source is equal to `c`.
193    ///
194    /// This function assumes that the `n`-th character in the input has already been fetched through
195    /// [`Input::lookahead`].
196    #[inline]
197    #[must_use]
198    fn nth_char_is(&self, n: usize, c: char) -> bool {
199        self.peek_nth(n) == c
200    }
201
202    /// Return whether the next 2 characters in the input source match the given characters.
203    ///
204    /// This function assumes that the next 2 characters in the input have already been fetched
205    /// through [`Input::lookahead`].
206    #[inline]
207    #[must_use]
208    fn next_2_are(&self, c1: char, c2: char) -> bool {
209        assert!(self.buflen() >= 2);
210        self.peek() == c1 && self.peek_nth(1) == c2
211    }
212
213    /// Return whether the next 3 characters in the input source match the given characters.
214    ///
215    /// This function assumes that the next 3 characters in the input have already been fetched
216    /// through [`Input::lookahead`].
217    #[inline]
218    #[must_use]
219    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
220        assert!(self.buflen() >= 3);
221        self.peek() == c1 && self.peek_nth(1) == c2 && self.peek_nth(2) == c3
222    }
223
224    /// Check whether the next characters correspond to a document indicator.
225    ///
226    /// This function assumes that the next 4 characters in the input have already been fetched
227    /// through [`Input::lookahead`].
228    #[inline]
229    #[must_use]
230    fn next_is_document_indicator(&self) -> bool {
231        assert!(self.buflen() >= 4);
232        is_blank_or_breakz(self.peek_nth(3))
233            && (self.next_3_are('.', '.', '.') || self.next_3_are('-', '-', '-'))
234    }
235
236    /// Check whether the next characters correspond to a start of document.
237    ///
238    /// This function assumes that the next 4 characters in the input have already been fetched
239    /// through [`Input::lookahead`].
240    #[inline]
241    #[must_use]
242    fn next_is_document_start(&self) -> bool {
243        assert!(self.buflen() >= 4);
244        self.next_3_are('-', '-', '-') && is_blank_or_breakz(self.peek_nth(3))
245    }
246
247    /// Check whether the next characters correspond to an end of document.
248    ///
249    /// This function assumes that the next 4 characters in the input have already been fetched
250    /// through [`Input::lookahead`].
251    #[inline]
252    #[must_use]
253    fn next_is_document_end(&self) -> bool {
254        assert!(self.buflen() >= 4);
255        self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
256    }
257
258    /// Skip YAML whitespace up to the end of the current line.
259    ///
260    /// Inline comments are consumed only after at least one preceding YAML whitespace character.
261    ///
262    /// # Return
263    /// Return a tuple with the number of characters that were consumed and the result of skipping
264    /// whitespace. The number of characters returned can be used to advance the index and column,
265    /// since no end-of-line character will be consumed.
266    /// See [`SkipTabs`] for more details on the success variant.
267    ///
268    /// # Errors
269    /// Errors if a comment is encountered but it was not preceded by a whitespace. In that event,
270    /// the first tuple element will contain the number of characters consumed prior to reaching
271    /// the `#`.
272    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
273        let mut encountered_tab = false;
274        let mut has_yaml_ws = false;
275        let mut chars_consumed = 0;
276        loop {
277            match self.look_ch() {
278                ' ' => {
279                    has_yaml_ws = true;
280                    self.skip();
281                }
282                '\t' if skip_tabs != SkipTabs::No => {
283                    encountered_tab = true;
284                    self.skip();
285                }
286                // YAML comments must be preceded by whitespace.
287                '#' if !encountered_tab && !has_yaml_ws => {
288                    return (
289                        chars_consumed,
290                        Err("comments must be separated from other tokens by whitespace"),
291                    );
292                }
293                '#' => {
294                    self.skip(); // Skip over '#'
295                    while !is_breakz(self.look_ch()) {
296                        self.skip();
297                        chars_consumed += 1;
298                    }
299                }
300                _ => break,
301            }
302            chars_consumed += 1;
303        }
304
305        (
306            chars_consumed,
307            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
308        )
309    }
310
311    /// Skip YAML blank characters, stopping before comments, line breaks, or other content.
312    ///
313    /// This is the comment-aware counterpart to [`Input::skip_ws_to_eol`]: it preserves a
314    /// following `#` for the scanner to tokenize while still letting input implementations batch
315    /// the common run of spaces and tabs.
316    ///
317    /// # Return
318    /// Returns the number of consumed characters and a [`SkipTabs::Result`] describing whether
319    /// tabs and valid YAML whitespace (` `) were encountered.
320    fn skip_ws_to_eol_blanks(&mut self, skip_tabs: SkipTabs) -> (usize, SkipTabs) {
321        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
322
323        let mut encountered_tab = false;
324        let mut has_yaml_ws = false;
325        let mut chars_consumed = 0;
326
327        loop {
328            match self.look_ch() {
329                ' ' => {
330                    has_yaml_ws = true;
331                    chars_consumed += 1;
332                    self.skip();
333                }
334                '\t' if skip_tabs != SkipTabs::No => {
335                    encountered_tab = true;
336                    chars_consumed += 1;
337                    self.skip();
338                }
339                _ => break,
340            }
341        }
342
343        (
344            chars_consumed,
345            SkipTabs::Result(encountered_tab, has_yaml_ws),
346        )
347    }
348
349    /// Check whether the next characters may be part of a plain scalar.
350    ///
351    /// This function assumes we are not given a blankz character.
352    #[allow(clippy::inline_always)]
353    #[inline(always)]
354    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
355        let nc = self.peek_nth(1);
356        match self.peek() {
357            // indicators can end a plain scalar, see 7.3.3. Plain Style
358            ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
359            c if in_flow && is_flow(c) => false,
360            _ => true,
361        }
362    }
363
364    /// Check whether the next character is [a blank] or [a break].
365    ///
366    /// The character must have previously been fetched through [`lookahead`]
367    ///
368    /// # Return
369    /// Returns true if the character is [a blank] or [a break], false otherwise.
370    ///
371    /// [`lookahead`]: Input::lookahead
372    /// [a blank]: is_blank
373    /// [a break]: is_break
374    #[inline]
375    fn next_is_blank_or_break(&self) -> bool {
376        is_blank(self.peek()) || is_break(self.peek())
377    }
378
379    /// Check whether the next character is [a blank] or [a breakz].
380    ///
381    /// The character must have previously been fetched through [`lookahead`]
382    ///
383    /// # Return
384    /// Returns true if the character is [a blank] or [a break], false otherwise.
385    ///
386    /// [`lookahead`]: Input::lookahead
387    /// [a blank]: is_blank
388    /// [a breakz]: is_breakz
389    #[inline]
390    fn next_is_blank_or_breakz(&self) -> bool {
391        is_blank(self.peek()) || is_breakz(self.peek())
392    }
393
394    /// Check whether the next character is [a blank].
395    ///
396    /// The character must have previously been fetched through [`lookahead`]
397    ///
398    /// # Return
399    /// Returns true if the character is [a blank], false otherwise.
400    ///
401    /// [`lookahead`]: Input::lookahead
402    /// [a blank]: is_blank
403    #[inline]
404    fn next_is_blank(&self) -> bool {
405        is_blank(self.peek())
406    }
407
408    /// Check whether the next character is [a break].
409    ///
410    /// The character must have previously been fetched through [`lookahead`]
411    ///
412    /// # Return
413    /// Returns true if the character is [a break], false otherwise.
414    ///
415    /// [`lookahead`]: Input::lookahead
416    /// [a break]: is_break
417    #[inline]
418    fn next_is_break(&self) -> bool {
419        is_break(self.peek())
420    }
421
422    /// Check whether the next character is [a breakz].
423    ///
424    /// The character must have previously been fetched through [`lookahead`]
425    ///
426    /// # Return
427    /// Returns true if the character is [a breakz], false otherwise.
428    ///
429    /// [`lookahead`]: Input::lookahead
430    /// [a breakz]: is_breakz
431    #[inline]
432    fn next_is_breakz(&self) -> bool {
433        is_breakz(self.peek())
434    }
435
436    /// Check whether the next character is [a z].
437    ///
438    /// The character must have previously been fetched through [`lookahead`]
439    ///
440    /// # Return
441    /// Returns true if the character is [a z], false otherwise.
442    ///
443    /// [`lookahead`]: Input::lookahead
444    /// [a z]: is_z
445    #[inline]
446    fn next_is_z(&self) -> bool {
447        is_z(self.peek())
448    }
449
450    /// Check whether the next character is [a flow].
451    ///
452    /// The character must have previously been fetched through [`lookahead`]
453    ///
454    /// # Return
455    /// Returns true if the character is [a flow], false otherwise.
456    ///
457    /// [`lookahead`]: Input::lookahead
458    /// [a flow]: is_flow
459    #[inline]
460    fn next_is_flow(&self) -> bool {
461        is_flow(self.peek())
462    }
463
464    /// Check whether the next character is [a digit].
465    ///
466    /// The character must have previously been fetched through [`lookahead`]
467    ///
468    /// # Return
469    /// Returns true if the character is [a digit], false otherwise.
470    ///
471    /// [`lookahead`]: Input::lookahead
472    /// [a digit]: is_digit
473    #[inline]
474    fn next_is_digit(&self) -> bool {
475        is_digit(self.peek())
476    }
477
478    /// Check whether the next character is [a letter].
479    ///
480    /// The character must have previously been fetched through [`lookahead`]
481    ///
482    /// # Return
483    /// Returns true if the character is [a letter], false otherwise.
484    ///
485    /// [`lookahead`]: Input::lookahead
486    /// [a letter]: is_alpha
487    #[inline]
488    fn next_is_alpha(&self) -> bool {
489        is_alpha(self.peek())
490    }
491
492    /// Skip characters from the input until a [breakz] is found.
493    ///
494    /// The characters are consumed from the input.
495    ///
496    /// # Return
497    /// Return the number of characters that were consumed. The number of characters returned can
498    /// be used to advance the index and column, since no end-of-line character will be consumed.
499    ///
500    /// [breakz]: is_breakz
501    #[inline]
502    fn skip_while_non_breakz(&mut self) -> usize {
503        let mut count = 0;
504        while !is_breakz(self.look_ch()) {
505            count += 1;
506            self.skip();
507        }
508        count
509    }
510
511    /// Skip characters from the input while [blanks] are found.
512    ///
513    /// The characters are consumed from the input.
514    ///
515    /// # Return
516    /// Return the number of characters that were consumed. The number of characters returned can
517    /// be used to advance the index and column, since no end-of-line character will be consumed.
518    ///
519    /// [blanks]: is_blank
520    fn skip_while_blank(&mut self) -> usize {
521        let mut n_bytes = 0;
522        while is_blank(self.look_ch()) {
523            n_bytes += self.peek().len_utf8();
524            self.skip();
525        }
526        n_bytes
527    }
528
529    /// Fetch characters from the input while we encounter letters and store them in `out`.
530    ///
531    /// The characters are consumed from the input.
532    ///
533    /// # Return
534    /// Return the number of characters that were consumed. The number of characters returned can
535    /// be used to advance the index and column, since no end-of-line character will be consumed.
536    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
537        let mut n_bytes = 0;
538        while is_alpha(self.look_ch()) {
539            let c = self.peek();
540            n_bytes += c.len_utf8();
541            out.push(c);
542            self.skip();
543        }
544        n_bytes
545    }
546
547    /// Fetch characters as long as they satisfy `is_yaml_non_space(c)`.
548    ///
549    /// The characters are consumed from the input.
550    ///
551    /// # Return
552    /// Return the number of characters that were consumed. The number of characters returned can
553    /// be used to advance the index and column, since no end-of-line character will be consumed.
554    fn fetch_while_is_yaml_non_space(&mut self, out: &mut String) -> usize {
555        let mut chars_consumed = 0;
556        loop {
557            let c = self.look_ch();
558            if !crate::char_traits::is_yaml_non_space(c) || is_z(c) {
559                break;
560            }
561            let c = self.peek();
562            out.push(c);
563            self.skip();
564            chars_consumed += 1;
565        }
566        chars_consumed
567    }
568
569    /// Fetch a chunk of plain scalar characters.
570    ///
571    /// This optimization method allows the input to batch process characters.
572    /// Returns (stopped, `chars_consumed`).
573    /// stopped is true if the chunk ended because of a non-plain-scalar character.
574    fn fetch_plain_scalar_chunk(
575        &mut self,
576        out: &mut String,
577        count: usize,
578        flow_level_gt_0: bool,
579    ) -> (bool, usize) {
580        let mut chars_consumed = 0;
581        for _ in 0..count {
582            self.lookahead(1);
583            if self.next_is_blank_or_breakz() || !self.next_can_be_plain_scalar(flow_level_gt_0) {
584                return (true, chars_consumed);
585            }
586            out.push(self.peek());
587            self.skip();
588            chars_consumed += 1;
589        }
590        (false, chars_consumed)
591    }
592}
593
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is valid YAML whitespace, it does not always behave the same as a space.
///
/// The type serves two purposes: [`SkipTabs::Yes`] and [`SkipTabs::No`] are passed *to* the
/// whitespace-skipping functions to select their behavior, while [`SkipTabs::Result`] is what
/// those functions hand *back* to describe what they skipped over.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least one valid YAML whitespace character has been encountered.
        bool,
    ),
}
611
612impl SkipTabs {
613    /// Whether tabs were found while skipping whitespace.
614    ///
615    /// This function must be called after a call to `skip_ws_to_eol`.
616    #[must_use]
617    pub fn found_tabs(self) -> bool {
618        matches!(self, SkipTabs::Result(true, _))
619    }
620
621    /// Whether a valid YAML whitespace has been found in skipped-over content.
622    ///
623    /// This function must be called after a call to `skip_ws_to_eol`.
624    #[must_use]
625    pub fn has_valid_yaml_ws(self) -> bool {
626        matches!(self, SkipTabs::Result(_, true))
627    }
628}
629
#[cfg(test)]
mod tests {
    use super::{Input, SkipTabs};

    /// An [`Input`] providing only the required methods, each as a trivial stub.
    ///
    /// It never buffers anything and always reports the end of input (`\0`), which makes it
    /// suitable for exercising the trait's default method implementations.
    struct MinimalInput;

    impl Input for MinimalInput {
        fn lookahead(&mut self, _count: usize) {}

        fn buflen(&self) -> usize {
            0
        }

        fn bufmaxlen(&self) -> usize {
            0
        }

        fn raw_read_ch(&mut self) -> char {
            '\0'
        }

        fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
            None
        }

        fn skip(&mut self) {}

        fn skip_n(&mut self, _count: usize) {}

        fn peek(&self) -> char {
            '\0'
        }

        fn peek_nth(&self, _n: usize) -> char {
            '\0'
        }
    }

    /// The optional slicing capabilities default to "unsupported".
    #[test]
    fn default_slice_bytes_returns_none() {
        let mut minimal = MinimalInput;

        // Exercise every stub so that the helper type itself is covered.
        minimal.lookahead(4);
        assert_eq!(0, minimal.buflen());
        assert_eq!(0, minimal.bufmaxlen());
        assert_eq!('\0', minimal.raw_read_ch());
        assert!(minimal.raw_read_non_breakz_ch().is_none());
        minimal.skip();
        minimal.skip_n(2);
        assert_eq!('\0', minimal.peek());
        assert_eq!('\0', minimal.peek_nth(1));

        // The defaults under test.
        assert!(minimal.byte_offset().is_none());
        assert!(minimal.slice_bytes(0, 0).is_none());
    }

    /// A `#` that is not preceded by whitespace is an error and is left in the input.
    #[test]
    fn default_skip_ws_to_eol_rejects_unseparated_comment() {
        let source = "#comment\n";
        let mut input = super::buffered::BufferedInput::new(source.chars());

        let (consumed, outcome) = input.skip_ws_to_eol(SkipTabs::Yes);

        assert_eq!(0, consumed);
        assert!(matches!(
            outcome,
            Err("comments must be separated from other tokens by whitespace")
        ));
        assert_eq!('#', input.peek());
    }
}