saphyr_parser/
input.rs

1//! Utilities to create a source of input to the parser.
2//!
3//! [`Input`] must be implemented for the parser to fetch input. Make sure your needs aren't
4//! covered by the [`BufferedInput`].
5
6pub(crate) mod buffered;
7pub(crate) mod str;
8
9#[allow(clippy::module_name_repetitions)]
10pub use buffered::BufferedInput;
11
12pub use crate::char_traits::{
13    is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
14};
15
16/// Interface for a source of characters.
17///
18/// Hiding the input's implementation behind this trait allows mostly:
19///  * For input-specific optimizations (for instance, using `str` methods instead of manually
20///    transferring one `char` at a time to a buffer).
21///  * To return `&str`s referencing the input string, thus avoiding potentially costly
22///    allocations. Should users need an owned version of the data, they can always `.to_owned()`
23///    their YAML object.
24pub trait Input {
25    /// A hint to the input source that we will need to read `count` characters.
26    ///
27    /// If the input is exhausted, `\0` can be used to pad the last characters and later returned.
28    /// The characters must not be consumed, but may be placed in an internal buffer.
29    ///
30    /// This method may be a no-op if buffering yields no performance improvement.
31    ///
32    /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer. The
33    /// parser tracks how many characters are loaded in the buffer and acts accordingly.
34    fn lookahead(&mut self, count: usize);
35
36    /// Return the number of buffered characters in `self`.
37    #[must_use]
38    fn buflen(&self) -> usize;
39
40    /// Return the capacity of the buffer in `self`.
41    #[must_use]
42    fn bufmaxlen(&self) -> usize;
43
44    /// Return whether the buffer (!= stream) is empty.
45    #[inline]
46    #[must_use]
47    fn buf_is_empty(&self) -> bool {
48        self.buflen() == 0
49    }
50
51    /// Read a character from the input stream and return it directly.
52    ///
53    /// The internal buffer (if any) is bypassed.
54    #[must_use]
55    fn raw_read_ch(&mut self) -> char;
56
57    /// Read a non-breakz a character from the input stream and return it directly.
58    ///
59    /// The internal buffer (if any) is bypassed.
60    ///
61    /// If the next character is a breakz, it is either not consumed or placed into the buffer (if
62    /// any).
63    #[must_use]
64    fn raw_read_non_breakz_ch(&mut self) -> Option<char>;
65
66    /// Consume the next character.
67    fn skip(&mut self);
68
69    /// Consume the next `count` character.
70    fn skip_n(&mut self, count: usize);
71
72    /// Return the next character, without consuming it.
73    ///
74    /// Users of the [`Input`] must make sure that the character has been loaded through a prior
75    /// call to [`Input::lookahead`]. Implementors of [`Input`] may assume that a valid call to
76    /// [`Input::lookahead`] has been made beforehand.
77    ///
78    /// # Return
79    /// If the input source is not exhausted, returns the next character to be fed into the
80    /// scanner. Otherwise, returns `\0`.
81    #[must_use]
82    fn peek(&self) -> char;
83
84    /// Return the `n`-th character in the buffer, without consuming it.
85    ///
86    /// This function assumes that the n-th character in the input has already been fetched through
87    /// [`Input::lookahead`].
88    #[must_use]
89    fn peek_nth(&self, n: usize) -> char;
90
91    /// Look for the next character and return it.
92    ///
93    /// The character is not consumed.
94    /// Equivalent to calling [`Input::lookahead`] and [`Input::peek`].
95    #[inline]
96    #[must_use]
97    fn look_ch(&mut self) -> char {
98        self.lookahead(1);
99        self.peek()
100    }
101
102    /// Return whether the next character in the input source is equal to `c`.
103    ///
104    /// This function assumes that the next character in the input has already been fetched through
105    /// [`Input::lookahead`].
106    #[inline]
107    #[must_use]
108    fn next_char_is(&self, c: char) -> bool {
109        self.peek() == c
110    }
111
112    /// Return whether the `n`-th character in the input source is equal to `c`.
113    ///
114    /// This function assumes that the n-th character in the input has already been fetched through
115    /// [`Input::lookahead`].
116    #[inline]
117    #[must_use]
118    fn nth_char_is(&self, n: usize, c: char) -> bool {
119        self.peek_nth(n) == c
120    }
121
122    /// Return whether the next 2 characters in the input source match the given characters.
123    ///
124    /// This function assumes that the next 2 characters in the input have already been fetched
125    /// through [`Input::lookahead`].
126    #[inline]
127    #[must_use]
128    fn next_2_are(&self, c1: char, c2: char) -> bool {
129        assert!(self.buflen() >= 2);
130        self.peek() == c1 && self.peek_nth(1) == c2
131    }
132
133    /// Return whether the next 3 characters in the input source match the given characters.
134    ///
135    /// This function assumes that the next 3 characters in the input have already been fetched
136    /// through [`Input::lookahead`].
137    #[inline]
138    #[must_use]
139    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
140        assert!(self.buflen() >= 3);
141        self.peek() == c1 && self.peek_nth(1) == c2 && self.peek_nth(2) == c3
142    }
143
144    /// Check whether the next characters correspond to a document indicator.
145    ///
146    /// This function assumes that the next 4 characters in the input has already been fetched
147    /// through [`Input::lookahead`].
148    #[inline]
149    #[must_use]
150    fn next_is_document_indicator(&self) -> bool {
151        assert!(self.buflen() >= 4);
152        is_blank_or_breakz(self.peek_nth(3))
153            && (self.next_3_are('.', '.', '.') || self.next_3_are('-', '-', '-'))
154    }
155
156    /// Check whether the next characters correspond to a start of document.
157    ///
158    /// This function assumes that the next 4 characters in the input has already been fetched
159    /// through [`Input::lookahead`].
160    #[inline]
161    #[must_use]
162    fn next_is_document_start(&self) -> bool {
163        assert!(self.buflen() >= 4);
164        self.next_3_are('-', '-', '-') && is_blank_or_breakz(self.peek_nth(3))
165    }
166
167    /// Check whether the next characters correspond to an end of document.
168    ///
169    /// This function assumes that the next 4 characters in the input has already been fetched
170    /// through [`Input::lookahead`].
171    #[inline]
172    #[must_use]
173    fn next_is_document_end(&self) -> bool {
174        assert!(self.buflen() >= 4);
175        self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
176    }
177
178    /// Skip yaml whitespace at most up to eol. Also skips comments. Advances the input.
179    ///
180    /// # Return
181    /// Return a tuple with the number of characters that were consumed and the result of skipping
182    /// whitespace. The number of characters returned can be used to advance the index and column,
183    /// since no end-of-line character will be consumed.
184    /// See [`SkipTabs`] For more details on the success variant.
185    ///
186    /// # Errors
187    /// Errors if a comment is encountered but it was not preceded by a whitespace. In that event,
188    /// the first tuple element will contain the number of characters consumed prior to reaching
189    /// the `#`.
190    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
191        let mut encountered_tab = false;
192        let mut has_yaml_ws = false;
193        let mut chars_consumed = 0;
194        loop {
195            match self.look_ch() {
196                ' ' => {
197                    has_yaml_ws = true;
198                    self.skip();
199                }
200                '\t' if skip_tabs != SkipTabs::No => {
201                    encountered_tab = true;
202                    self.skip();
203                }
204                // YAML comments must be preceded by whitespace.
205                '#' if !encountered_tab && !has_yaml_ws => {
206                    return (
207                        chars_consumed,
208                        Err("comments must be separated from other tokens by whitespace"),
209                    );
210                }
211                '#' => {
212                    self.skip(); // Skip over '#'
213                    while !is_breakz(self.look_ch()) {
214                        self.skip();
215                        chars_consumed += 1;
216                    }
217                }
218                _ => break,
219            }
220            chars_consumed += 1;
221        }
222
223        (
224            chars_consumed,
225            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
226        )
227    }
228
229    /// Check whether the next characters may be part of a plain scalar.
230    ///
231    /// This function assumes we are not given a blankz character.
232    #[allow(clippy::inline_always)]
233    #[inline(always)]
234    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
235        let nc = self.peek_nth(1);
236        match self.peek() {
237            // indicators can end a plain scalar, see 7.3.3. Plain Style
238            ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
239            c if in_flow && is_flow(c) => false,
240            _ => true,
241        }
242    }
243
244    /// Check whether the next character is [a blank] or [a break].
245    ///
246    /// The character must have previously been fetched through [`lookahead`]
247    ///
248    /// # Return
249    /// Returns true if the character is [a blank] or [a break], false otherwise.
250    ///
251    /// [`lookahead`]: Input::lookahead
252    /// [a blank]: is_blank
253    /// [a break]: is_break
254    #[inline]
255    fn next_is_blank_or_break(&self) -> bool {
256        is_blank(self.peek()) || is_break(self.peek())
257    }
258
259    /// Check whether the next character is [a blank] or [a breakz].
260    ///
261    /// The character must have previously been fetched through [`lookahead`]
262    ///
263    /// # Return
264    /// Returns true if the character is [a blank] or [a break], false otherwise.
265    ///
266    /// [`lookahead`]: Input::lookahead
267    /// [a blank]: is_blank
268    /// [a breakz]: is_breakz
269    #[inline]
270    fn next_is_blank_or_breakz(&self) -> bool {
271        is_blank(self.peek()) || is_breakz(self.peek())
272    }
273
274    /// Check whether the next character is [a blank].
275    ///
276    /// The character must have previously been fetched through [`lookahead`]
277    ///
278    /// # Return
279    /// Returns true if the character is [a blank], false otherwise.
280    ///
281    /// [`lookahead`]: Input::lookahead
282    /// [a blank]: is_blank
283    #[inline]
284    fn next_is_blank(&self) -> bool {
285        is_blank(self.peek())
286    }
287
288    /// Check whether the next character is [a break].
289    ///
290    /// The character must have previously been fetched through [`lookahead`]
291    ///
292    /// # Return
293    /// Returns true if the character is [a break], false otherwise.
294    ///
295    /// [`lookahead`]: Input::lookahead
296    /// [a break]: is_break
297    #[inline]
298    fn next_is_break(&self) -> bool {
299        is_break(self.peek())
300    }
301
302    /// Check whether the next character is [a breakz].
303    ///
304    /// The character must have previously been fetched through [`lookahead`]
305    ///
306    /// # Return
307    /// Returns true if the character is [a breakz], false otherwise.
308    ///
309    /// [`lookahead`]: Input::lookahead
310    /// [a breakz]: is_breakz
311    #[inline]
312    fn next_is_breakz(&self) -> bool {
313        is_breakz(self.peek())
314    }
315
316    /// Check whether the next character is [a z].
317    ///
318    /// The character must have previously been fetched through [`lookahead`]
319    ///
320    /// # Return
321    /// Returns true if the character is [a z], false otherwise.
322    ///
323    /// [`lookahead`]: Input::lookahead
324    /// [a z]: is_z
325    #[inline]
326    fn next_is_z(&self) -> bool {
327        is_z(self.peek())
328    }
329
330    /// Check whether the next character is [a flow].
331    ///
332    /// The character must have previously been fetched through [`lookahead`]
333    ///
334    /// # Return
335    /// Returns true if the character is [a flow], false otherwise.
336    ///
337    /// [`lookahead`]: Input::lookahead
338    /// [a flow]: is_flow
339    #[inline]
340    fn next_is_flow(&self) -> bool {
341        is_flow(self.peek())
342    }
343
344    /// Check whether the next character is [a digit].
345    ///
346    /// The character must have previously been fetched through [`lookahead`]
347    ///
348    /// # Return
349    /// Returns true if the character is [a digit], false otherwise.
350    ///
351    /// [`lookahead`]: Input::lookahead
352    /// [a digit]: is_digit
353    #[inline]
354    fn next_is_digit(&self) -> bool {
355        is_digit(self.peek())
356    }
357
358    /// Check whether the next character is [a letter].
359    ///
360    /// The character must have previously been fetched through [`lookahead`]
361    ///
362    /// # Return
363    /// Returns true if the character is [a letter], false otherwise.
364    ///
365    /// [`lookahead`]: Input::lookahead
366    /// [a letter]: is_alpha
367    #[inline]
368    fn next_is_alpha(&self) -> bool {
369        is_alpha(self.peek())
370    }
371
372    /// Skip characters from the input until a [breakz] is found.
373    ///
374    /// The characters are consumed from the input.
375    ///
376    /// # Return
377    /// Return the number of characters that were consumed. The number of characters returned can
378    /// be used to advance the index and column, since no end-of-line character will be consumed.
379    ///
380    /// [breakz]: is_breakz
381    #[inline]
382    fn skip_while_non_breakz(&mut self) -> usize {
383        let mut count = 0;
384        while !is_breakz(self.look_ch()) {
385            count += 1;
386            self.skip();
387        }
388        count
389    }
390
391    /// Skip characters from the input while [blanks] are found.
392    ///
393    /// The characters are consumed from the input.
394    ///
395    /// # Return
396    /// Return the number of characters that were consumed. The number of characters returned can
397    /// be used to advance the index and column, since no end-of-line character will be consumed.
398    ///
399    /// [blanks]: is_blank
400    fn skip_while_blank(&mut self) -> usize {
401        let mut n_chars = 0;
402        while is_blank(self.look_ch()) {
403            n_chars += 1;
404            self.skip();
405        }
406        n_chars
407    }
408
409    /// Fetch characters from the input while we encounter letters and store them in `out`.
410    ///
411    /// The characters are consumed from the input.
412    ///
413    /// # Return
414    /// Return the number of characters that were consumed. The number of characters returned can
415    /// be used to advance the index and column, since no end-of-line character will be consumed.
416    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
417        let mut n_chars = 0;
418        while is_alpha(self.look_ch()) {
419            n_chars += 1;
420            out.push(self.peek());
421            self.skip();
422        }
423        n_chars
424    }
425}
426
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
///
/// This type is used both as the input of `skip_ws_to_eol` ([`SkipTabs::Yes`] and
/// [`SkipTabs::No`]) and as its successful output ([`SkipTabs::Result`]).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least 1 valid yaml whitespace has been encountered.
        bool,
    ),
}
444
445impl SkipTabs {
446    /// Whether tabs were found while skipping whitespace.
447    ///
448    /// This function must be called after a call to `skip_ws_to_eol`.
449    #[must_use]
450    pub fn found_tabs(self) -> bool {
451        matches!(self, SkipTabs::Result(true, _))
452    }
453
454    /// Whether a valid YAML whitespace has been found in skipped-over content.
455    ///
456    /// This function must be called after a call to `skip_ws_to_eol`.
457    #[must_use]
458    pub fn has_valid_yaml_ws(self) -> bool {
459        matches!(self, SkipTabs::Result(_, true))
460    }
461}