granit_parser/input.rs
//! Utilities to create a source of input to the parser.
//!
//! [`Input`] must be implemented for the parser to fetch input. Before writing your own
//! implementation, check whether [`BufferedInput`] already covers your needs.
5
6use alloc::string::String;
7
8pub(crate) mod buffered;
9pub(crate) mod str;
10
11#[allow(clippy::module_name_repetitions)]
12pub use buffered::BufferedInput;
13
14/// A trait for inputs that can provide borrowed slices with a specific lifetime.
15///
16/// This trait enables zero-copy (`Cow::Borrowed`) token values for inputs that keep a stable
17/// backing string. The key difference from [`Input::slice_bytes`] is that this method returns
18/// a slice with the input's original lifetime `'a`, not tied to `&self`.
19///
20/// For inputs that support zero-copy (like [`str::StrInput`]), this returns `Some(&'a str)`.
21/// For streaming inputs that don't have stable backing storage, this returns `None`.
22pub trait BorrowedInput<'a>: Input {
23 /// Return a borrowed slice of the underlying source between two byte offsets.
24 ///
25 /// Unlike [`Input::slice_bytes`], this returns a slice with the input's lifetime `'a`,
26 /// allowing the slice to outlive the borrow of `&self`.
27 ///
28 /// `start` and `end` are byte offsets as returned by [`Input::byte_offset`]. The interval is
29 /// half-open: `[start, end)`.
30 ///
31 /// Returns `None` if the input does not support zero-copy slicing.
32 #[must_use]
33 fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'a str>;
34}
35
36pub use crate::char_traits::{
37 is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
38};
39
40/// Interface for a source of characters.
41///
42/// Hiding the input's implementation behind this trait allows input-specific optimizations, such
43/// as using `str` methods instead of manually transferring one `char` at a time to a buffer.
44/// Implementations with stable backing storage can also return borrowed `&str` slices and avoid
45/// allocating token values.
46pub trait Input {
47 /// A hint to the input source that we will need to read `count` characters.
48 ///
49 /// If the input is exhausted, `\0` can be used to pad the last characters and later returned.
50 /// The characters must not be consumed, but may be placed in an internal buffer.
51 ///
52 /// This method may be a no-op if buffering yields no performance improvement.
53 ///
54 /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer. The
55 /// parser tracks how many characters are loaded in the buffer and acts accordingly.
56 fn lookahead(&mut self, count: usize);
57
58 /// Return the number of buffered characters in `self`.
59 #[must_use]
60 fn buflen(&self) -> usize;
61
62 /// Return the maximum number of characters this input can buffer for lookahead.
63 #[must_use]
64 fn bufmaxlen(&self) -> usize;
65
66 /// Return whether the lookahead buffer is empty.
67 #[inline]
68 #[must_use]
69 fn buf_is_empty(&self) -> bool {
70 self.buflen() == 0
71 }
72
73 /// Read a character from the input stream and return it directly.
74 ///
75 /// The internal buffer (if any) is bypassed.
76 #[must_use]
77 fn raw_read_ch(&mut self) -> char;
78
79 /// Read a non-breakz character from the input stream and return it directly.
80 ///
81 /// The internal buffer (if any) is bypassed.
82 ///
83 /// If the next character is a breakz, it is either not consumed or placed into the buffer (if
84 /// any).
85 #[must_use]
86 fn raw_read_non_breakz_ch(&mut self) -> Option<char>;
87
88 /// Consume the next character.
89 fn skip(&mut self);
90
91 /// Consume the next `count` characters.
92 fn skip_n(&mut self, count: usize);
93
94 /// Return the next character, without consuming it.
95 ///
96 /// Users of the [`Input`] must make sure that the character has been loaded through a prior
97 /// call to [`Input::lookahead`]. Implementors of [`Input`] may assume that a valid call to
98 /// [`Input::lookahead`] has been made beforehand.
99 ///
100 /// # Return
101 /// If the input source is not exhausted, returns the next character to be fed into the
102 /// scanner. Otherwise, returns `\0`.
103 #[must_use]
104 fn peek(&self) -> char;
105
106 /// Return the `n`-th character in the buffer, without consuming it.
107 ///
108 /// This function assumes that the `n`-th character in the input has already been fetched through
109 /// [`Input::lookahead`].
110 #[must_use]
111 fn peek_nth(&self, n: usize) -> char;
112
113 /// Return the current byte offset in the underlying source, if available.
114 ///
115 /// This is an *optional* capability that enables zero-copy (`Cow::Borrowed`) token values
116 /// for inputs that keep a stable backing string (notably [`str::StrInput`]).
117 ///
118 /// The returned value (when `Some`) is the number of bytes that have been consumed so far,
119 /// i.e. an offset into the original source string.
120 ///
121 /// # Correctness contract
122 /// Implementations returning `Some(_)` must satisfy all of the following:
123 ///
124 /// - The offset is a valid UTF-8 boundary in the underlying source.
125 /// - The offset is monotonically non-decreasing as characters are consumed.
126 /// - The underlying source is stable for the duration of parsing (no reallocation/mutation)
127 /// so that slices returned by [`Input::slice_bytes`] remain valid.
128 ///
129 /// Inputs that cannot provide stable slicing (e.g. stream/iterator inputs) must return
130 /// `None`.
131 #[inline]
132 #[must_use]
133 fn byte_offset(&self) -> Option<usize> {
134 None
135 }
136
137 /// Return a borrowed slice of the underlying source between two byte offsets.
138 ///
139 /// This is an *optional* capability used to produce `Cow::Borrowed` values without
140 /// allocating.
141 ///
142 /// `start` and `end` are byte offsets as returned by [`Input::byte_offset`]. The interval is
143 /// half-open: `[start, end)`.
144 ///
145 /// # Correctness contract
146 /// Implementations returning `Some(&str)` must ensure:
147 ///
148 /// - `start <= end`.
149 /// - Both offsets are valid UTF-8 boundaries.
150 /// - The returned `&str` is a view into the stable underlying source associated with this
151 /// input.
152 ///
153 /// Implementations that return `None` from [`Input::byte_offset`] must also return `None`
154 /// here.
155 #[inline]
156 #[must_use]
157 fn slice_bytes(&self, _start: usize, _end: usize) -> Option<&str> {
158 None
159 }
160
161 /// Return whether this input may contain a `#` character.
162 ///
163 /// This is a conservative performance hint. Inputs that cannot answer cheaply should return
164 /// `true`, which keeps full comment handling enabled.
165 #[inline]
166 #[must_use]
167 fn may_contain_comments(&self) -> bool {
168 true
169 }
170
171 /// Look for the next character and return it.
172 ///
173 /// The character is not consumed.
174 /// Equivalent to calling [`Input::lookahead`] and [`Input::peek`].
175 #[inline]
176 #[must_use]
177 fn look_ch(&mut self) -> char {
178 self.lookahead(1);
179 self.peek()
180 }
181
182 /// Return whether the next character in the input source is equal to `c`.
183 ///
184 /// This function assumes that the next character in the input has already been fetched through
185 /// [`Input::lookahead`].
186 #[inline]
187 #[must_use]
188 fn next_char_is(&self, c: char) -> bool {
189 self.peek() == c
190 }
191
192 /// Return whether the `n`-th character in the input source is equal to `c`.
193 ///
194 /// This function assumes that the `n`-th character in the input has already been fetched through
195 /// [`Input::lookahead`].
196 #[inline]
197 #[must_use]
198 fn nth_char_is(&self, n: usize, c: char) -> bool {
199 self.peek_nth(n) == c
200 }
201
202 /// Return whether the next 2 characters in the input source match the given characters.
203 ///
204 /// This function assumes that the next 2 characters in the input have already been fetched
205 /// through [`Input::lookahead`].
206 #[inline]
207 #[must_use]
208 fn next_2_are(&self, c1: char, c2: char) -> bool {
209 assert!(self.buflen() >= 2);
210 self.peek() == c1 && self.peek_nth(1) == c2
211 }
212
213 /// Return whether the next 3 characters in the input source match the given characters.
214 ///
215 /// This function assumes that the next 3 characters in the input have already been fetched
216 /// through [`Input::lookahead`].
217 #[inline]
218 #[must_use]
219 fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
220 assert!(self.buflen() >= 3);
221 self.peek() == c1 && self.peek_nth(1) == c2 && self.peek_nth(2) == c3
222 }
223
224 /// Check whether the next characters correspond to a document indicator.
225 ///
226 /// This function assumes that the next 4 characters in the input have already been fetched
227 /// through [`Input::lookahead`].
228 #[inline]
229 #[must_use]
230 fn next_is_document_indicator(&self) -> bool {
231 assert!(self.buflen() >= 4);
232 is_blank_or_breakz(self.peek_nth(3))
233 && (self.next_3_are('.', '.', '.') || self.next_3_are('-', '-', '-'))
234 }
235
236 /// Check whether the next characters correspond to a start of document.
237 ///
238 /// This function assumes that the next 4 characters in the input have already been fetched
239 /// through [`Input::lookahead`].
240 #[inline]
241 #[must_use]
242 fn next_is_document_start(&self) -> bool {
243 assert!(self.buflen() >= 4);
244 self.next_3_are('-', '-', '-') && is_blank_or_breakz(self.peek_nth(3))
245 }
246
247 /// Check whether the next characters correspond to an end of document.
248 ///
249 /// This function assumes that the next 4 characters in the input have already been fetched
250 /// through [`Input::lookahead`].
251 #[inline]
252 #[must_use]
253 fn next_is_document_end(&self) -> bool {
254 assert!(self.buflen() >= 4);
255 self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
256 }
257
258 /// Skip YAML whitespace up to the end of the current line.
259 ///
260 /// Inline comments are consumed only after at least one preceding YAML whitespace character.
261 ///
262 /// # Return
263 /// Return a tuple with the number of characters that were consumed and the result of skipping
264 /// whitespace. The number of characters returned can be used to advance the index and column,
265 /// since no end-of-line character will be consumed.
266 /// See [`SkipTabs`] for more details on the success variant.
267 ///
268 /// # Errors
269 /// Errors if a comment is encountered but it was not preceded by a whitespace. In that event,
270 /// the first tuple element will contain the number of characters consumed prior to reaching
271 /// the `#`.
272 fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
273 let mut encountered_tab = false;
274 let mut has_yaml_ws = false;
275 let mut chars_consumed = 0;
276 loop {
277 match self.look_ch() {
278 ' ' => {
279 has_yaml_ws = true;
280 self.skip();
281 }
282 '\t' if skip_tabs != SkipTabs::No => {
283 encountered_tab = true;
284 self.skip();
285 }
286 // YAML comments must be preceded by whitespace.
287 '#' if !encountered_tab && !has_yaml_ws => {
288 return (
289 chars_consumed,
290 Err("comments must be separated from other tokens by whitespace"),
291 );
292 }
293 '#' => {
294 self.skip(); // Skip over '#'
295 while !is_breakz(self.look_ch()) {
296 self.skip();
297 chars_consumed += 1;
298 }
299 }
300 _ => break,
301 }
302 chars_consumed += 1;
303 }
304
305 (
306 chars_consumed,
307 Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
308 )
309 }
310
311 /// Skip YAML blank characters, stopping before comments, line breaks, or other content.
312 ///
313 /// This is the comment-aware counterpart to [`Input::skip_ws_to_eol`]: it preserves a
314 /// following `#` for the scanner to tokenize while still letting input implementations batch
315 /// the common run of spaces and tabs.
316 ///
317 /// # Return
318 /// Returns the number of consumed characters and a [`SkipTabs::Result`] describing whether
319 /// tabs and valid YAML whitespace (` `) were encountered.
320 fn skip_ws_to_eol_blanks(&mut self, skip_tabs: SkipTabs) -> (usize, SkipTabs) {
321 assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
322
323 let mut encountered_tab = false;
324 let mut has_yaml_ws = false;
325 let mut chars_consumed = 0;
326
327 loop {
328 match self.look_ch() {
329 ' ' => {
330 has_yaml_ws = true;
331 chars_consumed += 1;
332 self.skip();
333 }
334 '\t' if skip_tabs != SkipTabs::No => {
335 encountered_tab = true;
336 chars_consumed += 1;
337 self.skip();
338 }
339 _ => break,
340 }
341 }
342
343 (
344 chars_consumed,
345 SkipTabs::Result(encountered_tab, has_yaml_ws),
346 )
347 }
348
349 /// Check whether the next characters may be part of a plain scalar.
350 ///
351 /// This function assumes we are not given a blankz character.
352 #[allow(clippy::inline_always)]
353 #[inline(always)]
354 fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
355 let nc = self.peek_nth(1);
356 match self.peek() {
357 // indicators can end a plain scalar, see 7.3.3. Plain Style
358 ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
359 c if in_flow && is_flow(c) => false,
360 _ => true,
361 }
362 }
363
364 /// Check whether the next character is [a blank] or [a break].
365 ///
366 /// The character must have previously been fetched through [`lookahead`]
367 ///
368 /// # Return
369 /// Returns true if the character is [a blank] or [a break], false otherwise.
370 ///
371 /// [`lookahead`]: Input::lookahead
372 /// [a blank]: is_blank
373 /// [a break]: is_break
374 #[inline]
375 fn next_is_blank_or_break(&self) -> bool {
376 is_blank(self.peek()) || is_break(self.peek())
377 }
378
379 /// Check whether the next character is [a blank] or [a breakz].
380 ///
381 /// The character must have previously been fetched through [`lookahead`]
382 ///
383 /// # Return
384 /// Returns true if the character is [a blank] or [a break], false otherwise.
385 ///
386 /// [`lookahead`]: Input::lookahead
387 /// [a blank]: is_blank
388 /// [a breakz]: is_breakz
389 #[inline]
390 fn next_is_blank_or_breakz(&self) -> bool {
391 is_blank(self.peek()) || is_breakz(self.peek())
392 }
393
394 /// Check whether the next character is [a blank].
395 ///
396 /// The character must have previously been fetched through [`lookahead`]
397 ///
398 /// # Return
399 /// Returns true if the character is [a blank], false otherwise.
400 ///
401 /// [`lookahead`]: Input::lookahead
402 /// [a blank]: is_blank
403 #[inline]
404 fn next_is_blank(&self) -> bool {
405 is_blank(self.peek())
406 }
407
408 /// Check whether the next character is [a break].
409 ///
410 /// The character must have previously been fetched through [`lookahead`]
411 ///
412 /// # Return
413 /// Returns true if the character is [a break], false otherwise.
414 ///
415 /// [`lookahead`]: Input::lookahead
416 /// [a break]: is_break
417 #[inline]
418 fn next_is_break(&self) -> bool {
419 is_break(self.peek())
420 }
421
422 /// Check whether the next character is [a breakz].
423 ///
424 /// The character must have previously been fetched through [`lookahead`]
425 ///
426 /// # Return
427 /// Returns true if the character is [a breakz], false otherwise.
428 ///
429 /// [`lookahead`]: Input::lookahead
430 /// [a breakz]: is_breakz
431 #[inline]
432 fn next_is_breakz(&self) -> bool {
433 is_breakz(self.peek())
434 }
435
436 /// Check whether the next character is [a z].
437 ///
438 /// The character must have previously been fetched through [`lookahead`]
439 ///
440 /// # Return
441 /// Returns true if the character is [a z], false otherwise.
442 ///
443 /// [`lookahead`]: Input::lookahead
444 /// [a z]: is_z
445 #[inline]
446 fn next_is_z(&self) -> bool {
447 is_z(self.peek())
448 }
449
450 /// Check whether the next character is [a flow].
451 ///
452 /// The character must have previously been fetched through [`lookahead`]
453 ///
454 /// # Return
455 /// Returns true if the character is [a flow], false otherwise.
456 ///
457 /// [`lookahead`]: Input::lookahead
458 /// [a flow]: is_flow
459 #[inline]
460 fn next_is_flow(&self) -> bool {
461 is_flow(self.peek())
462 }
463
464 /// Check whether the next character is [a digit].
465 ///
466 /// The character must have previously been fetched through [`lookahead`]
467 ///
468 /// # Return
469 /// Returns true if the character is [a digit], false otherwise.
470 ///
471 /// [`lookahead`]: Input::lookahead
472 /// [a digit]: is_digit
473 #[inline]
474 fn next_is_digit(&self) -> bool {
475 is_digit(self.peek())
476 }
477
478 /// Check whether the next character is [a letter].
479 ///
480 /// The character must have previously been fetched through [`lookahead`]
481 ///
482 /// # Return
483 /// Returns true if the character is [a letter], false otherwise.
484 ///
485 /// [`lookahead`]: Input::lookahead
486 /// [a letter]: is_alpha
487 #[inline]
488 fn next_is_alpha(&self) -> bool {
489 is_alpha(self.peek())
490 }
491
492 /// Skip characters from the input until a [breakz] is found.
493 ///
494 /// The characters are consumed from the input.
495 ///
496 /// # Return
497 /// Return the number of characters that were consumed. The number of characters returned can
498 /// be used to advance the index and column, since no end-of-line character will be consumed.
499 ///
500 /// [breakz]: is_breakz
501 #[inline]
502 fn skip_while_non_breakz(&mut self) -> usize {
503 let mut count = 0;
504 while !is_breakz(self.look_ch()) {
505 count += 1;
506 self.skip();
507 }
508 count
509 }
510
511 /// Skip characters from the input while [blanks] are found.
512 ///
513 /// The characters are consumed from the input.
514 ///
515 /// # Return
516 /// Return the number of characters that were consumed. The number of characters returned can
517 /// be used to advance the index and column, since no end-of-line character will be consumed.
518 ///
519 /// [blanks]: is_blank
520 fn skip_while_blank(&mut self) -> usize {
521 let mut n_bytes = 0;
522 while is_blank(self.look_ch()) {
523 n_bytes += self.peek().len_utf8();
524 self.skip();
525 }
526 n_bytes
527 }
528
529 /// Fetch characters from the input while we encounter letters and store them in `out`.
530 ///
531 /// The characters are consumed from the input.
532 ///
533 /// # Return
534 /// Return the number of characters that were consumed. The number of characters returned can
535 /// be used to advance the index and column, since no end-of-line character will be consumed.
536 fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
537 let mut n_bytes = 0;
538 while is_alpha(self.look_ch()) {
539 let c = self.peek();
540 n_bytes += c.len_utf8();
541 out.push(c);
542 self.skip();
543 }
544 n_bytes
545 }
546
547 /// Fetch characters as long as they satisfy `is_yaml_non_space(c)`.
548 ///
549 /// The characters are consumed from the input.
550 ///
551 /// # Return
552 /// Return the number of characters that were consumed. The number of characters returned can
553 /// be used to advance the index and column, since no end-of-line character will be consumed.
554 fn fetch_while_is_yaml_non_space(&mut self, out: &mut String) -> usize {
555 let mut chars_consumed = 0;
556 loop {
557 let c = self.look_ch();
558 if !crate::char_traits::is_yaml_non_space(c) || is_z(c) {
559 break;
560 }
561 let c = self.peek();
562 out.push(c);
563 self.skip();
564 chars_consumed += 1;
565 }
566 chars_consumed
567 }
568
569 /// Fetch a chunk of plain scalar characters.
570 ///
571 /// This optimization method allows the input to batch process characters.
572 /// Returns (stopped, `chars_consumed`).
573 /// stopped is true if the chunk ended because of a non-plain-scalar character.
574 fn fetch_plain_scalar_chunk(
575 &mut self,
576 out: &mut String,
577 count: usize,
578 flow_level_gt_0: bool,
579 ) -> (bool, usize) {
580 let mut chars_consumed = 0;
581 for _ in 0..count {
582 self.lookahead(1);
583 if self.next_is_blank_or_breakz() || !self.next_can_be_plain_scalar(flow_level_gt_0) {
584 return (true, chars_consumed);
585 }
586 out.push(self.peek());
587 self.skip();
588 chars_consumed += 1;
589 }
590 (false, chars_consumed)
591 }
592}
593
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is valid YAML whitespace, it does not always behave the same as a space.
///
/// [`SkipTabs::Yes`] and [`SkipTabs::No`] are the values passed _to_ the whitespace-skipping
/// methods of [`Input`]; [`SkipTabs::Result`] is the value they hand back.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least one valid YAML whitespace character has been encountered.
        bool,
    ),
}
611
612impl SkipTabs {
613 /// Whether tabs were found while skipping whitespace.
614 ///
615 /// This function must be called after a call to `skip_ws_to_eol`.
616 #[must_use]
617 pub fn found_tabs(self) -> bool {
618 matches!(self, SkipTabs::Result(true, _))
619 }
620
621 /// Whether a valid YAML whitespace has been found in skipped-over content.
622 ///
623 /// This function must be called after a call to `skip_ws_to_eol`.
624 #[must_use]
625 pub fn has_valid_yaml_ws(self) -> bool {
626 matches!(self, SkipTabs::Result(_, true))
627 }
628}
629
#[cfg(test)]
mod tests {
    use super::{Input, SkipTabs};

    /// Smallest possible [`Input`]: it implements only the required methods, buffers nothing
    /// and always reports `\0`, so every provided method keeps its default body.
    struct MinimalInput;

    impl Input for MinimalInput {
        fn lookahead(&mut self, _count: usize) {}

        fn buflen(&self) -> usize {
            0
        }

        fn bufmaxlen(&self) -> usize {
            0
        }

        fn raw_read_ch(&mut self) -> char {
            '\0'
        }

        fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
            None
        }

        fn skip(&mut self) {}

        fn skip_n(&mut self, _count: usize) {}

        fn peek(&self) -> char {
            '\0'
        }

        fn peek_nth(&self, _n: usize) -> char {
            '\0'
        }
    }

    /// Exercise every required method of the stub, then check that the optional zero-copy
    /// capabilities default to `None`.
    #[test]
    fn default_slice_bytes_returns_none() {
        let mut stub = MinimalInput;

        stub.lookahead(4);
        assert_eq!(stub.buflen(), 0);
        assert_eq!(stub.bufmaxlen(), 0);
        assert_eq!(stub.raw_read_ch(), '\0');
        assert!(stub.raw_read_non_breakz_ch().is_none());
        stub.skip();
        stub.skip_n(2);
        assert_eq!(stub.peek(), '\0');
        assert_eq!(stub.peek_nth(1), '\0');
        assert!(stub.byte_offset().is_none());
        assert!(stub.slice_bytes(0, 0).is_none());
    }

    /// A `#` that directly starts the input must be rejected without being consumed.
    #[test]
    fn default_skip_ws_to_eol_rejects_unseparated_comment() {
        use super::buffered::BufferedInput;

        let mut source = BufferedInput::new("#comment\n".chars());

        let (consumed, outcome) = source.skip_ws_to_eol(SkipTabs::Yes);

        assert_eq!(consumed, 0);
        assert_eq!(
            outcome.err(),
            Some("comments must be separated from other tokens by whitespace")
        );
        // The offending '#' is still the next character.
        assert_eq!(source.peek(), '#');
    }
}