saphyr_parser_bw/input.rs
1//! Utilities to create a source of input to the parser.
2//!
//! [`Input`] must be implemented for the parser to fetch input. Before writing your own
//! implementation, check whether [`BufferedInput`] already covers your needs.
5
6use alloc::string::String;
7
8pub(crate) mod buffered;
9pub(crate) mod str;
10
11#[allow(clippy::module_name_repetitions)]
12pub use buffered::BufferedInput;
13
14/// A trait for inputs that can provide borrowed slices with a specific lifetime.
15///
16/// This trait enables zero-copy (`Cow::Borrowed`) token values for inputs that keep a stable
17/// backing string. The key difference from [`Input::slice_bytes`] is that this method returns
18/// a slice with the input's original lifetime `'a`, not tied to `&self`.
19///
20/// For inputs that support zero-copy (like [`str::StrInput`]), this returns `Some(&'a str)`.
21/// For streaming inputs that don't have stable backing storage, this returns `None`.
22pub trait BorrowedInput<'a>: Input {
23 /// Return a borrowed slice of the underlying source between two byte offsets.
24 ///
25 /// Unlike [`Input::slice_bytes`], this returns a slice with the input's lifetime `'a`,
26 /// allowing the slice to outlive the borrow of `&self`.
27 ///
28 /// `start` and `end` are byte offsets as returned by [`Input::byte_offset`]. The interval is
29 /// half-open: `[start, end)`.
30 ///
31 /// Returns `None` if the input does not support zero-copy slicing.
32 #[must_use]
33 fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'a str>;
34}
35
36pub use crate::char_traits::{
37 is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
38};
39
40/// Interface for a source of characters.
41///
42/// Hiding the input's implementation behind this trait allows mostly:
43/// * For input-specific optimizations (for instance, using `str` methods instead of manually
44/// transferring one `char` at a time to a buffer).
45/// * To return `&str`s referencing the input string, thus avoiding potentially costly
46/// allocations. Should users need an owned version of the data, they can always `.to_owned()`
47/// their YAML object.
48pub trait Input {
49 /// A hint to the input source that we will need to read `count` characters.
50 ///
51 /// If the input is exhausted, `\0` can be used to pad the last characters and later returned.
52 /// The characters must not be consumed, but may be placed in an internal buffer.
53 ///
54 /// This method may be a no-op if buffering yields no performance improvement.
55 ///
56 /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer. The
57 /// parser tracks how many characters are loaded in the buffer and acts accordingly.
58 fn lookahead(&mut self, count: usize);
59
60 /// Return the number of buffered characters in `self`.
61 #[must_use]
62 fn buflen(&self) -> usize;
63
64 /// Return the capacity of the buffer in `self`.
65 #[must_use]
66 fn bufmaxlen(&self) -> usize;
67
68 /// Return whether the buffer (!= stream) is empty.
69 #[inline]
70 #[must_use]
71 fn buf_is_empty(&self) -> bool {
72 self.buflen() == 0
73 }
74
75 /// Read a character from the input stream and return it directly.
76 ///
77 /// The internal buffer (if any) is bypassed.
78 #[must_use]
79 fn raw_read_ch(&mut self) -> char;
80
81 /// Read a non-breakz a character from the input stream and return it directly.
82 ///
83 /// The internal buffer (if any) is bypassed.
84 ///
85 /// If the next character is a breakz, it is either not consumed or placed into the buffer (if
86 /// any).
87 #[must_use]
88 fn raw_read_non_breakz_ch(&mut self) -> Option<char>;
89
90 /// Consume the next character.
91 fn skip(&mut self);
92
93 /// Consume the next `count` character.
94 fn skip_n(&mut self, count: usize);
95
96 /// Return the next character, without consuming it.
97 ///
98 /// Users of the [`Input`] must make sure that the character has been loaded through a prior
99 /// call to [`Input::lookahead`]. Implementors of [`Input`] may assume that a valid call to
100 /// [`Input::lookahead`] has been made beforehand.
101 ///
102 /// # Return
103 /// If the input source is not exhausted, returns the next character to be fed into the
104 /// scanner. Otherwise, returns `\0`.
105 #[must_use]
106 fn peek(&self) -> char;
107
108 /// Return the `n`-th character in the buffer, without consuming it.
109 ///
110 /// This function assumes that the n-th character in the input has already been fetched through
111 /// [`Input::lookahead`].
112 #[must_use]
113 fn peek_nth(&self, n: usize) -> char;
114
115 /// Return the current byte offset in the underlying source, if available.
116 ///
117 /// This is an *optional* capability that enables zero-copy (`Cow::Borrowed`) token values
118 /// for inputs that keep a stable backing string (notably [`str::StrInput`]).
119 ///
120 /// The returned value (when `Some`) is the number of bytes that have been consumed so far,
121 /// i.e. an offset into the original source string.
122 ///
123 /// # Correctness contract
124 /// Implementations returning `Some(_)` must satisfy all of the following:
125 ///
126 /// - The offset is a valid UTF-8 boundary in the underlying source.
127 /// - The offset is monotonically non-decreasing as characters are consumed.
128 /// - The underlying source is stable for the duration of parsing (no reallocation/mutation)
129 /// so that slices returned by [`Input::slice_bytes`] remain valid.
130 ///
131 /// Inputs that cannot provide stable slicing (e.g. stream/iterator inputs) must return
132 /// `None`.
133 #[inline]
134 #[must_use]
135 fn byte_offset(&self) -> Option<usize> {
136 None
137 }
138
139 /// Return a borrowed slice of the underlying source between two byte offsets.
140 ///
141 /// This is an *optional* capability used to produce `Cow::Borrowed` values without
142 /// allocating.
143 ///
144 /// `start` and `end` are byte offsets as returned by [`Input::byte_offset`]. The interval is
145 /// half-open: `[start, end)`.
146 ///
147 /// # Correctness contract
148 /// Implementations returning `Some(&str)` must ensure:
149 ///
150 /// - `start <= end`.
151 /// - Both offsets are valid UTF-8 boundaries.
152 /// - The returned `&str` is a view into the stable underlying source associated with this
153 /// input.
154 ///
155 /// Implementations that return `None` from [`Input::byte_offset`] must also return `None`
156 /// here.
157 #[inline]
158 #[must_use]
159 fn slice_bytes(&self, _start: usize, _end: usize) -> Option<&str> {
160 None
161 }
162
163 /// Look for the next character and return it.
164 ///
165 /// The character is not consumed.
166 /// Equivalent to calling [`Input::lookahead`] and [`Input::peek`].
167 #[inline]
168 #[must_use]
169 fn look_ch(&mut self) -> char {
170 self.lookahead(1);
171 self.peek()
172 }
173
174 /// Return whether the next character in the input source is equal to `c`.
175 ///
176 /// This function assumes that the next character in the input has already been fetched through
177 /// [`Input::lookahead`].
178 #[inline]
179 #[must_use]
180 fn next_char_is(&self, c: char) -> bool {
181 self.peek() == c
182 }
183
184 /// Return whether the `n`-th character in the input source is equal to `c`.
185 ///
186 /// This function assumes that the n-th character in the input has already been fetched through
187 /// [`Input::lookahead`].
188 #[inline]
189 #[must_use]
190 fn nth_char_is(&self, n: usize, c: char) -> bool {
191 self.peek_nth(n) == c
192 }
193
194 /// Return whether the next 2 characters in the input source match the given characters.
195 ///
196 /// This function assumes that the next 2 characters in the input have already been fetched
197 /// through [`Input::lookahead`].
198 #[inline]
199 #[must_use]
200 fn next_2_are(&self, c1: char, c2: char) -> bool {
201 assert!(self.buflen() >= 2);
202 self.peek() == c1 && self.peek_nth(1) == c2
203 }
204
205 /// Return whether the next 3 characters in the input source match the given characters.
206 ///
207 /// This function assumes that the next 3 characters in the input have already been fetched
208 /// through [`Input::lookahead`].
209 #[inline]
210 #[must_use]
211 fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
212 assert!(self.buflen() >= 3);
213 self.peek() == c1 && self.peek_nth(1) == c2 && self.peek_nth(2) == c3
214 }
215
216 /// Check whether the next characters correspond to a document indicator.
217 ///
218 /// This function assumes that the next 4 characters in the input has already been fetched
219 /// through [`Input::lookahead`].
220 #[inline]
221 #[must_use]
222 fn next_is_document_indicator(&self) -> bool {
223 assert!(self.buflen() >= 4);
224 is_blank_or_breakz(self.peek_nth(3))
225 && (self.next_3_are('.', '.', '.') || self.next_3_are('-', '-', '-'))
226 }
227
228 /// Check whether the next characters correspond to a start of document.
229 ///
230 /// This function assumes that the next 4 characters in the input has already been fetched
231 /// through [`Input::lookahead`].
232 #[inline]
233 #[must_use]
234 fn next_is_document_start(&self) -> bool {
235 assert!(self.buflen() >= 4);
236 self.next_3_are('-', '-', '-') && is_blank_or_breakz(self.peek_nth(3))
237 }
238
239 /// Check whether the next characters correspond to an end of document.
240 ///
241 /// This function assumes that the next 4 characters in the input has already been fetched
242 /// through [`Input::lookahead`].
243 #[inline]
244 #[must_use]
245 fn next_is_document_end(&self) -> bool {
246 assert!(self.buflen() >= 4);
247 self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
248 }
249
250 /// Skip yaml whitespace at most up to eol. Also skips comments. Advances the input.
251 ///
252 /// # Return
253 /// Return a tuple with the number of characters that were consumed and the result of skipping
254 /// whitespace. The number of characters returned can be used to advance the index and column,
255 /// since no end-of-line character will be consumed.
256 /// See [`SkipTabs`] For more details on the success variant.
257 ///
258 /// # Errors
259 /// Errors if a comment is encountered but it was not preceded by a whitespace. In that event,
260 /// the first tuple element will contain the number of characters consumed prior to reaching
261 /// the `#`.
262 fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
263 let mut encountered_tab = false;
264 let mut has_yaml_ws = false;
265 let mut chars_consumed = 0;
266 loop {
267 match self.look_ch() {
268 ' ' => {
269 has_yaml_ws = true;
270 self.skip();
271 }
272 '\t' if skip_tabs != SkipTabs::No => {
273 encountered_tab = true;
274 self.skip();
275 }
276 // YAML comments must be preceded by whitespace.
277 '#' if !encountered_tab && !has_yaml_ws => {
278 return (
279 chars_consumed,
280 Err("comments must be separated from other tokens by whitespace"),
281 );
282 }
283 '#' => {
284 self.skip(); // Skip over '#'
285 while !is_breakz(self.look_ch()) {
286 self.skip();
287 chars_consumed += 1;
288 }
289 }
290 _ => break,
291 }
292 chars_consumed += 1;
293 }
294
295 (
296 chars_consumed,
297 Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
298 )
299 }
300
301 /// Check whether the next characters may be part of a plain scalar.
302 ///
303 /// This function assumes we are not given a blankz character.
304 #[allow(clippy::inline_always)]
305 #[inline(always)]
306 fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
307 let nc = self.peek_nth(1);
308 match self.peek() {
309 // indicators can end a plain scalar, see 7.3.3. Plain Style
310 ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
311 c if in_flow && is_flow(c) => false,
312 _ => true,
313 }
314 }
315
316 /// Check whether the next character is [a blank] or [a break].
317 ///
318 /// The character must have previously been fetched through [`lookahead`]
319 ///
320 /// # Return
321 /// Returns true if the character is [a blank] or [a break], false otherwise.
322 ///
323 /// [`lookahead`]: Input::lookahead
324 /// [a blank]: is_blank
325 /// [a break]: is_break
326 #[inline]
327 fn next_is_blank_or_break(&self) -> bool {
328 is_blank(self.peek()) || is_break(self.peek())
329 }
330
331 /// Check whether the next character is [a blank] or [a breakz].
332 ///
333 /// The character must have previously been fetched through [`lookahead`]
334 ///
335 /// # Return
336 /// Returns true if the character is [a blank] or [a break], false otherwise.
337 ///
338 /// [`lookahead`]: Input::lookahead
339 /// [a blank]: is_blank
340 /// [a breakz]: is_breakz
341 #[inline]
342 fn next_is_blank_or_breakz(&self) -> bool {
343 is_blank(self.peek()) || is_breakz(self.peek())
344 }
345
346 /// Check whether the next character is [a blank].
347 ///
348 /// The character must have previously been fetched through [`lookahead`]
349 ///
350 /// # Return
351 /// Returns true if the character is [a blank], false otherwise.
352 ///
353 /// [`lookahead`]: Input::lookahead
354 /// [a blank]: is_blank
355 #[inline]
356 fn next_is_blank(&self) -> bool {
357 is_blank(self.peek())
358 }
359
360 /// Check whether the next character is [a break].
361 ///
362 /// The character must have previously been fetched through [`lookahead`]
363 ///
364 /// # Return
365 /// Returns true if the character is [a break], false otherwise.
366 ///
367 /// [`lookahead`]: Input::lookahead
368 /// [a break]: is_break
369 #[inline]
370 fn next_is_break(&self) -> bool {
371 is_break(self.peek())
372 }
373
374 /// Check whether the next character is [a breakz].
375 ///
376 /// The character must have previously been fetched through [`lookahead`]
377 ///
378 /// # Return
379 /// Returns true if the character is [a breakz], false otherwise.
380 ///
381 /// [`lookahead`]: Input::lookahead
382 /// [a breakz]: is_breakz
383 #[inline]
384 fn next_is_breakz(&self) -> bool {
385 is_breakz(self.peek())
386 }
387
388 /// Check whether the next character is [a z].
389 ///
390 /// The character must have previously been fetched through [`lookahead`]
391 ///
392 /// # Return
393 /// Returns true if the character is [a z], false otherwise.
394 ///
395 /// [`lookahead`]: Input::lookahead
396 /// [a z]: is_z
397 #[inline]
398 fn next_is_z(&self) -> bool {
399 is_z(self.peek())
400 }
401
402 /// Check whether the next character is [a flow].
403 ///
404 /// The character must have previously been fetched through [`lookahead`]
405 ///
406 /// # Return
407 /// Returns true if the character is [a flow], false otherwise.
408 ///
409 /// [`lookahead`]: Input::lookahead
410 /// [a flow]: is_flow
411 #[inline]
412 fn next_is_flow(&self) -> bool {
413 is_flow(self.peek())
414 }
415
416 /// Check whether the next character is [a digit].
417 ///
418 /// The character must have previously been fetched through [`lookahead`]
419 ///
420 /// # Return
421 /// Returns true if the character is [a digit], false otherwise.
422 ///
423 /// [`lookahead`]: Input::lookahead
424 /// [a digit]: is_digit
425 #[inline]
426 fn next_is_digit(&self) -> bool {
427 is_digit(self.peek())
428 }
429
430 /// Check whether the next character is [a letter].
431 ///
432 /// The character must have previously been fetched through [`lookahead`]
433 ///
434 /// # Return
435 /// Returns true if the character is [a letter], false otherwise.
436 ///
437 /// [`lookahead`]: Input::lookahead
438 /// [a letter]: is_alpha
439 #[inline]
440 fn next_is_alpha(&self) -> bool {
441 is_alpha(self.peek())
442 }
443
444 /// Skip characters from the input until a [breakz] is found.
445 ///
446 /// The characters are consumed from the input.
447 ///
448 /// # Return
449 /// Return the number of characters that were consumed. The number of characters returned can
450 /// be used to advance the index and column, since no end-of-line character will be consumed.
451 ///
452 /// [breakz]: is_breakz
453 #[inline]
454 fn skip_while_non_breakz(&mut self) -> usize {
455 let mut count = 0;
456 while !is_breakz(self.look_ch()) {
457 count += self.peek().len_utf8();
458 self.skip();
459 }
460 count
461 }
462
463 /// Skip characters from the input while [blanks] are found.
464 ///
465 /// The characters are consumed from the input.
466 ///
467 /// # Return
468 /// Return the number of characters that were consumed. The number of characters returned can
469 /// be used to advance the index and column, since no end-of-line character will be consumed.
470 ///
471 /// [blanks]: is_blank
472 fn skip_while_blank(&mut self) -> usize {
473 let mut n_bytes = 0;
474 while is_blank(self.look_ch()) {
475 n_bytes += self.peek().len_utf8();
476 self.skip();
477 }
478 n_bytes
479 }
480
481 /// Fetch characters from the input while we encounter letters and store them in `out`.
482 ///
483 /// The characters are consumed from the input.
484 ///
485 /// # Return
486 /// Return the number of characters that were consumed. The number of characters returned can
487 /// be used to advance the index and column, since no end-of-line character will be consumed.
488 fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
489 let mut n_bytes = 0;
490 while is_alpha(self.look_ch()) {
491 let c = self.peek();
492 n_bytes += c.len_utf8();
493 out.push(c);
494 self.skip();
495 }
496 n_bytes
497 }
498
499 /// Fetch characters as long as they satisfy `is_yaml_non_space(c)`.
500 ///
501 /// The characters are consumed from the input.
502 ///
503 /// # Return
504 /// Return the number of characters that were consumed. The number of characters returned can
505 /// be used to advance the index and column, since no end-of-line character will be consumed.
506 fn fetch_while_is_yaml_non_space(&mut self, out: &mut String) -> usize {
507 let mut n_bytes = 0;
508 while crate::char_traits::is_yaml_non_space(self.look_ch()) && !is_z(self.look_ch()) {
509 let c = self.peek();
510 n_bytes += c.len_utf8();
511 out.push(c);
512 self.skip();
513 }
514 n_bytes
515 }
516
517 /// Fetch a chunk of plain scalar characters.
518 ///
519 /// This optimization method allows the input to batch process characters.
520 /// Returns (stopped, `chars_consumed`).
521 /// stopped is true if the chunk ended because of a non-plain-scalar character.
522 fn fetch_plain_scalar_chunk(
523 &mut self,
524 out: &mut String,
525 count: usize,
526 flow_level_gt_0: bool,
527 ) -> (bool, usize) {
528 let mut chars_consumed = 0;
529 for _ in 0..count {
530 self.lookahead(1);
531 if self.next_is_blank_or_breakz() || !self.next_can_be_plain_scalar(flow_level_gt_0) {
532 return (true, chars_consumed);
533 }
534 out.push(self.peek());
535 self.skip();
536 chars_consumed += 1;
537 }
538 (false, chars_consumed)
539 }
540}
541
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
///
/// The same type is used both as the argument of `skip_ws_to_eol` ([`SkipTabs::Yes`] /
/// [`SkipTabs::No`]) and as its success value ([`SkipTabs::Result`]).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least 1 valid yaml whitespace has been encountered.
        bool,
    ),
}
559
560impl SkipTabs {
561 /// Whether tabs were found while skipping whitespace.
562 ///
563 /// This function must be called after a call to `skip_ws_to_eol`.
564 #[must_use]
565 pub fn found_tabs(self) -> bool {
566 matches!(self, SkipTabs::Result(true, _))
567 }
568
569 /// Whether a valid YAML whitespace has been found in skipped-over content.
570 ///
571 /// This function must be called after a call to `skip_ws_to_eol`.
572 #[must_use]
573 pub fn has_valid_yaml_ws(self) -> bool {
574 matches!(self, SkipTabs::Result(_, true))
575 }
576}