Skip to main content

saphyr_parser_bw/
input.rs

1//! Utilities to create a source of input to the parser.
2//!
3//! [`Input`] must be implemented for the parser to fetch input. Make sure your needs aren't
4//! covered by the [`BufferedInput`].
5
6use alloc::string::String;
7
8pub(crate) mod buffered;
9pub(crate) mod str;
10
11#[allow(clippy::module_name_repetitions)]
12pub use buffered::BufferedInput;
13
/// A trait for inputs that can provide borrowed slices with a specific lifetime.
///
/// This trait enables zero-copy (`Cow::Borrowed`) token values for inputs that keep a stable
/// backing string. The key difference from [`Input::slice_bytes`] is that
/// [`BorrowedInput::slice_borrowed`] returns a slice with the input's original lifetime `'a`,
/// not one tied to the borrow of `&self`.
///
/// For inputs that support zero-copy (like [`str::StrInput`]),
/// [`BorrowedInput::slice_borrowed`] returns `Some(&'a str)`. For streaming inputs that don't
/// have stable backing storage, it returns `None`.
pub trait BorrowedInput<'a>: Input {
    /// Return a borrowed slice of the underlying source between two byte offsets.
    ///
    /// Unlike [`Input::slice_bytes`], this returns a slice with the input's lifetime `'a`,
    /// allowing the slice to outlive the borrow of `&self`.
    ///
    /// `start` and `end` are byte offsets as returned by [`Input::byte_offset`]. The interval is
    /// half-open: `[start, end)`.
    ///
    /// # Return
    /// Returns `None` if the input does not support zero-copy slicing.
    #[must_use]
    fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'a str>;
}
35
36pub use crate::char_traits::{
37    is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
38};
39
40/// Interface for a source of characters.
41///
42/// Hiding the input's implementation behind this trait allows mostly:
43///  * For input-specific optimizations (for instance, using `str` methods instead of manually
44///    transferring one `char` at a time to a buffer).
45///  * To return `&str`s referencing the input string, thus avoiding potentially costly
46///    allocations. Should users need an owned version of the data, they can always `.to_owned()`
47///    their YAML object.
48pub trait Input {
49    /// A hint to the input source that we will need to read `count` characters.
50    ///
51    /// If the input is exhausted, `\0` can be used to pad the last characters and later returned.
52    /// The characters must not be consumed, but may be placed in an internal buffer.
53    ///
54    /// This method may be a no-op if buffering yields no performance improvement.
55    ///
56    /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer. The
57    /// parser tracks how many characters are loaded in the buffer and acts accordingly.
58    fn lookahead(&mut self, count: usize);
59
60    /// Return the number of buffered characters in `self`.
61    #[must_use]
62    fn buflen(&self) -> usize;
63
64    /// Return the capacity of the buffer in `self`.
65    #[must_use]
66    fn bufmaxlen(&self) -> usize;
67
68    /// Return whether the buffer (!= stream) is empty.
69    #[inline]
70    #[must_use]
71    fn buf_is_empty(&self) -> bool {
72        self.buflen() == 0
73    }
74
75    /// Read a character from the input stream and return it directly.
76    ///
77    /// The internal buffer (if any) is bypassed.
78    #[must_use]
79    fn raw_read_ch(&mut self) -> char;
80
81    /// Read a non-breakz a character from the input stream and return it directly.
82    ///
83    /// The internal buffer (if any) is bypassed.
84    ///
85    /// If the next character is a breakz, it is either not consumed or placed into the buffer (if
86    /// any).
87    #[must_use]
88    fn raw_read_non_breakz_ch(&mut self) -> Option<char>;
89
90    /// Consume the next character.
91    fn skip(&mut self);
92
93    /// Consume the next `count` character.
94    fn skip_n(&mut self, count: usize);
95
96    /// Return the next character, without consuming it.
97    ///
98    /// Users of the [`Input`] must make sure that the character has been loaded through a prior
99    /// call to [`Input::lookahead`]. Implementors of [`Input`] may assume that a valid call to
100    /// [`Input::lookahead`] has been made beforehand.
101    ///
102    /// # Return
103    /// If the input source is not exhausted, returns the next character to be fed into the
104    /// scanner. Otherwise, returns `\0`.
105    #[must_use]
106    fn peek(&self) -> char;
107
108    /// Return the `n`-th character in the buffer, without consuming it.
109    ///
110    /// This function assumes that the n-th character in the input has already been fetched through
111    /// [`Input::lookahead`].
112    #[must_use]
113    fn peek_nth(&self, n: usize) -> char;
114
115    /// Return the current byte offset in the underlying source, if available.
116    ///
117    /// This is an *optional* capability that enables zero-copy (`Cow::Borrowed`) token values
118    /// for inputs that keep a stable backing string (notably [`str::StrInput`]).
119    ///
120    /// The returned value (when `Some`) is the number of bytes that have been consumed so far,
121    /// i.e. an offset into the original source string.
122    ///
123    /// # Correctness contract
124    /// Implementations returning `Some(_)` must satisfy all of the following:
125    ///
126    /// - The offset is a valid UTF-8 boundary in the underlying source.
127    /// - The offset is monotonically non-decreasing as characters are consumed.
128    /// - The underlying source is stable for the duration of parsing (no reallocation/mutation)
129    ///   so that slices returned by [`Input::slice_bytes`] remain valid.
130    ///
131    /// Inputs that cannot provide stable slicing (e.g. stream/iterator inputs) must return
132    /// `None`.
133    #[inline]
134    #[must_use]
135    fn byte_offset(&self) -> Option<usize> {
136        None
137    }
138
139    /// Return a borrowed slice of the underlying source between two byte offsets.
140    ///
141    /// This is an *optional* capability used to produce `Cow::Borrowed` values without
142    /// allocating.
143    ///
144    /// `start` and `end` are byte offsets as returned by [`Input::byte_offset`]. The interval is
145    /// half-open: `[start, end)`.
146    ///
147    /// # Correctness contract
148    /// Implementations returning `Some(&str)` must ensure:
149    ///
150    /// - `start <= end`.
151    /// - Both offsets are valid UTF-8 boundaries.
152    /// - The returned `&str` is a view into the stable underlying source associated with this
153    ///   input.
154    ///
155    /// Implementations that return `None` from [`Input::byte_offset`] must also return `None`
156    /// here.
157    #[inline]
158    #[must_use]
159    fn slice_bytes(&self, _start: usize, _end: usize) -> Option<&str> {
160        None
161    }
162
163    /// Look for the next character and return it.
164    ///
165    /// The character is not consumed.
166    /// Equivalent to calling [`Input::lookahead`] and [`Input::peek`].
167    #[inline]
168    #[must_use]
169    fn look_ch(&mut self) -> char {
170        self.lookahead(1);
171        self.peek()
172    }
173
174    /// Return whether the next character in the input source is equal to `c`.
175    ///
176    /// This function assumes that the next character in the input has already been fetched through
177    /// [`Input::lookahead`].
178    #[inline]
179    #[must_use]
180    fn next_char_is(&self, c: char) -> bool {
181        self.peek() == c
182    }
183
184    /// Return whether the `n`-th character in the input source is equal to `c`.
185    ///
186    /// This function assumes that the n-th character in the input has already been fetched through
187    /// [`Input::lookahead`].
188    #[inline]
189    #[must_use]
190    fn nth_char_is(&self, n: usize, c: char) -> bool {
191        self.peek_nth(n) == c
192    }
193
194    /// Return whether the next 2 characters in the input source match the given characters.
195    ///
196    /// This function assumes that the next 2 characters in the input have already been fetched
197    /// through [`Input::lookahead`].
198    #[inline]
199    #[must_use]
200    fn next_2_are(&self, c1: char, c2: char) -> bool {
201        assert!(self.buflen() >= 2);
202        self.peek() == c1 && self.peek_nth(1) == c2
203    }
204
205    /// Return whether the next 3 characters in the input source match the given characters.
206    ///
207    /// This function assumes that the next 3 characters in the input have already been fetched
208    /// through [`Input::lookahead`].
209    #[inline]
210    #[must_use]
211    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
212        assert!(self.buflen() >= 3);
213        self.peek() == c1 && self.peek_nth(1) == c2 && self.peek_nth(2) == c3
214    }
215
216    /// Check whether the next characters correspond to a document indicator.
217    ///
218    /// This function assumes that the next 4 characters in the input has already been fetched
219    /// through [`Input::lookahead`].
220    #[inline]
221    #[must_use]
222    fn next_is_document_indicator(&self) -> bool {
223        assert!(self.buflen() >= 4);
224        is_blank_or_breakz(self.peek_nth(3))
225            && (self.next_3_are('.', '.', '.') || self.next_3_are('-', '-', '-'))
226    }
227
228    /// Check whether the next characters correspond to a start of document.
229    ///
230    /// This function assumes that the next 4 characters in the input has already been fetched
231    /// through [`Input::lookahead`].
232    #[inline]
233    #[must_use]
234    fn next_is_document_start(&self) -> bool {
235        assert!(self.buflen() >= 4);
236        self.next_3_are('-', '-', '-') && is_blank_or_breakz(self.peek_nth(3))
237    }
238
239    /// Check whether the next characters correspond to an end of document.
240    ///
241    /// This function assumes that the next 4 characters in the input has already been fetched
242    /// through [`Input::lookahead`].
243    #[inline]
244    #[must_use]
245    fn next_is_document_end(&self) -> bool {
246        assert!(self.buflen() >= 4);
247        self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
248    }
249
250    /// Skip yaml whitespace at most up to eol. Also skips comments. Advances the input.
251    ///
252    /// # Return
253    /// Return a tuple with the number of characters that were consumed and the result of skipping
254    /// whitespace. The number of characters returned can be used to advance the index and column,
255    /// since no end-of-line character will be consumed.
256    /// See [`SkipTabs`] For more details on the success variant.
257    ///
258    /// # Errors
259    /// Errors if a comment is encountered but it was not preceded by a whitespace. In that event,
260    /// the first tuple element will contain the number of characters consumed prior to reaching
261    /// the `#`.
262    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
263        let mut encountered_tab = false;
264        let mut has_yaml_ws = false;
265        let mut chars_consumed = 0;
266        loop {
267            match self.look_ch() {
268                ' ' => {
269                    has_yaml_ws = true;
270                    self.skip();
271                }
272                '\t' if skip_tabs != SkipTabs::No => {
273                    encountered_tab = true;
274                    self.skip();
275                }
276                // YAML comments must be preceded by whitespace.
277                '#' if !encountered_tab && !has_yaml_ws => {
278                    return (
279                        chars_consumed,
280                        Err("comments must be separated from other tokens by whitespace"),
281                    );
282                }
283                '#' => {
284                    self.skip(); // Skip over '#'
285                    while !is_breakz(self.look_ch()) {
286                        self.skip();
287                        chars_consumed += 1;
288                    }
289                }
290                _ => break,
291            }
292            chars_consumed += 1;
293        }
294
295        (
296            chars_consumed,
297            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
298        )
299    }
300
301    /// Check whether the next characters may be part of a plain scalar.
302    ///
303    /// This function assumes we are not given a blankz character.
304    #[allow(clippy::inline_always)]
305    #[inline(always)]
306    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
307        let nc = self.peek_nth(1);
308        match self.peek() {
309            // indicators can end a plain scalar, see 7.3.3. Plain Style
310            ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
311            c if in_flow && is_flow(c) => false,
312            _ => true,
313        }
314    }
315
316    /// Check whether the next character is [a blank] or [a break].
317    ///
318    /// The character must have previously been fetched through [`lookahead`]
319    ///
320    /// # Return
321    /// Returns true if the character is [a blank] or [a break], false otherwise.
322    ///
323    /// [`lookahead`]: Input::lookahead
324    /// [a blank]: is_blank
325    /// [a break]: is_break
326    #[inline]
327    fn next_is_blank_or_break(&self) -> bool {
328        is_blank(self.peek()) || is_break(self.peek())
329    }
330
331    /// Check whether the next character is [a blank] or [a breakz].
332    ///
333    /// The character must have previously been fetched through [`lookahead`]
334    ///
335    /// # Return
336    /// Returns true if the character is [a blank] or [a break], false otherwise.
337    ///
338    /// [`lookahead`]: Input::lookahead
339    /// [a blank]: is_blank
340    /// [a breakz]: is_breakz
341    #[inline]
342    fn next_is_blank_or_breakz(&self) -> bool {
343        is_blank(self.peek()) || is_breakz(self.peek())
344    }
345
346    /// Check whether the next character is [a blank].
347    ///
348    /// The character must have previously been fetched through [`lookahead`]
349    ///
350    /// # Return
351    /// Returns true if the character is [a blank], false otherwise.
352    ///
353    /// [`lookahead`]: Input::lookahead
354    /// [a blank]: is_blank
355    #[inline]
356    fn next_is_blank(&self) -> bool {
357        is_blank(self.peek())
358    }
359
360    /// Check whether the next character is [a break].
361    ///
362    /// The character must have previously been fetched through [`lookahead`]
363    ///
364    /// # Return
365    /// Returns true if the character is [a break], false otherwise.
366    ///
367    /// [`lookahead`]: Input::lookahead
368    /// [a break]: is_break
369    #[inline]
370    fn next_is_break(&self) -> bool {
371        is_break(self.peek())
372    }
373
374    /// Check whether the next character is [a breakz].
375    ///
376    /// The character must have previously been fetched through [`lookahead`]
377    ///
378    /// # Return
379    /// Returns true if the character is [a breakz], false otherwise.
380    ///
381    /// [`lookahead`]: Input::lookahead
382    /// [a breakz]: is_breakz
383    #[inline]
384    fn next_is_breakz(&self) -> bool {
385        is_breakz(self.peek())
386    }
387
388    /// Check whether the next character is [a z].
389    ///
390    /// The character must have previously been fetched through [`lookahead`]
391    ///
392    /// # Return
393    /// Returns true if the character is [a z], false otherwise.
394    ///
395    /// [`lookahead`]: Input::lookahead
396    /// [a z]: is_z
397    #[inline]
398    fn next_is_z(&self) -> bool {
399        is_z(self.peek())
400    }
401
402    /// Check whether the next character is [a flow].
403    ///
404    /// The character must have previously been fetched through [`lookahead`]
405    ///
406    /// # Return
407    /// Returns true if the character is [a flow], false otherwise.
408    ///
409    /// [`lookahead`]: Input::lookahead
410    /// [a flow]: is_flow
411    #[inline]
412    fn next_is_flow(&self) -> bool {
413        is_flow(self.peek())
414    }
415
416    /// Check whether the next character is [a digit].
417    ///
418    /// The character must have previously been fetched through [`lookahead`]
419    ///
420    /// # Return
421    /// Returns true if the character is [a digit], false otherwise.
422    ///
423    /// [`lookahead`]: Input::lookahead
424    /// [a digit]: is_digit
425    #[inline]
426    fn next_is_digit(&self) -> bool {
427        is_digit(self.peek())
428    }
429
430    /// Check whether the next character is [a letter].
431    ///
432    /// The character must have previously been fetched through [`lookahead`]
433    ///
434    /// # Return
435    /// Returns true if the character is [a letter], false otherwise.
436    ///
437    /// [`lookahead`]: Input::lookahead
438    /// [a letter]: is_alpha
439    #[inline]
440    fn next_is_alpha(&self) -> bool {
441        is_alpha(self.peek())
442    }
443
444    /// Skip characters from the input until a [breakz] is found.
445    ///
446    /// The characters are consumed from the input.
447    ///
448    /// # Return
449    /// Return the number of characters that were consumed. The number of characters returned can
450    /// be used to advance the index and column, since no end-of-line character will be consumed.
451    ///
452    /// [breakz]: is_breakz
453    #[inline]
454    fn skip_while_non_breakz(&mut self) -> usize {
455        let mut count = 0;
456        while !is_breakz(self.look_ch()) {
457            count += self.peek().len_utf8();
458            self.skip();
459        }
460        count
461    }
462
463    /// Skip characters from the input while [blanks] are found.
464    ///
465    /// The characters are consumed from the input.
466    ///
467    /// # Return
468    /// Return the number of characters that were consumed. The number of characters returned can
469    /// be used to advance the index and column, since no end-of-line character will be consumed.
470    ///
471    /// [blanks]: is_blank
472    fn skip_while_blank(&mut self) -> usize {
473        let mut n_bytes = 0;
474        while is_blank(self.look_ch()) {
475            n_bytes += self.peek().len_utf8();
476            self.skip();
477        }
478        n_bytes
479    }
480
481    /// Fetch characters from the input while we encounter letters and store them in `out`.
482    ///
483    /// The characters are consumed from the input.
484    ///
485    /// # Return
486    /// Return the number of characters that were consumed. The number of characters returned can
487    /// be used to advance the index and column, since no end-of-line character will be consumed.
488    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
489        let mut n_bytes = 0;
490        while is_alpha(self.look_ch()) {
491            let c = self.peek();
492            n_bytes += c.len_utf8();
493            out.push(c);
494            self.skip();
495        }
496        n_bytes
497    }
498
499    /// Fetch characters as long as they satisfy `is_yaml_non_space(c)`.
500    ///
501    /// The characters are consumed from the input.
502    ///
503    /// # Return
504    /// Return the number of characters that were consumed. The number of characters returned can
505    /// be used to advance the index and column, since no end-of-line character will be consumed.
506    fn fetch_while_is_yaml_non_space(&mut self, out: &mut String) -> usize {
507        let mut n_bytes = 0;
508        while crate::char_traits::is_yaml_non_space(self.look_ch()) && !is_z(self.look_ch()) {
509            let c = self.peek();
510            n_bytes += c.len_utf8();
511            out.push(c);
512            self.skip();
513        }
514        n_bytes
515    }
516
517    /// Fetch a chunk of plain scalar characters.
518    ///
519    /// This optimization method allows the input to batch process characters.
520    /// Returns (stopped, `chars_consumed`).
521    /// stopped is true if the chunk ended because of a non-plain-scalar character.
522    fn fetch_plain_scalar_chunk(
523        &mut self,
524        out: &mut String,
525        count: usize,
526        flow_level_gt_0: bool,
527    ) -> (bool, usize) {
528        let mut chars_consumed = 0;
529        for _ in 0..count {
530            self.lookahead(1);
531            if self.next_is_blank_or_breakz() || !self.next_can_be_plain_scalar(flow_level_gt_0) {
532                return (true, chars_consumed);
533            }
534            out.push(self.peek());
535            self.skip();
536            chars_consumed += 1;
537        }
538        (false, chars_consumed)
539    }
540}
541
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
///
/// `Yes` and `No` are inputs to `skip_ws_to_eol`; `Result` is what that function hands back on
/// success.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least 1 valid yaml whitespace has been encountered.
        bool,
    ),
}
559
560impl SkipTabs {
561    /// Whether tabs were found while skipping whitespace.
562    ///
563    /// This function must be called after a call to `skip_ws_to_eol`.
564    #[must_use]
565    pub fn found_tabs(self) -> bool {
566        matches!(self, SkipTabs::Result(true, _))
567    }
568
569    /// Whether a valid YAML whitespace has been found in skipped-over content.
570    ///
571    /// This function must be called after a call to `skip_ws_to_eol`.
572    #[must_use]
573    pub fn has_valid_yaml_ws(self) -> bool {
574        matches!(self, SkipTabs::Result(_, true))
575    }
576}