saphyr_parser/input.rs
1//! Utilities to create a source of input to the parser.
2//!
3//! [`Input`] must be implemented for the parser to fetch input. Before writing your own
4//! implementation, check whether [`BufferedInput`] already covers your needs.
5
6pub(crate) mod buffered;
7pub(crate) mod str;
8
9#[allow(clippy::module_name_repetitions)]
10pub use buffered::BufferedInput;
11
12pub use crate::char_traits::{
13 is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
14};
15
16/// Interface for a source of characters.
17///
18/// Hiding the input's implementation behind this trait allows mostly:
19/// * For input-specific optimizations (for instance, using `str` methods instead of manually
20/// transferring one `char` at a time to a buffer).
21/// * To return `&str`s referencing the input string, thus avoiding potentially costly
22/// allocations. Should users need an owned version of the data, they can always `.to_owned()`
23/// their YAML object.
24pub trait Input {
25 /// A hint to the input source that we will need to read `count` characters.
26 ///
27 /// If the input is exhausted, `\0` can be used to pad the last characters and later returned.
28 /// The characters must not be consumed, but may be placed in an internal buffer.
29 ///
30 /// This method may be a no-op if buffering yields no performance improvement.
31 ///
32 /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer. The
33 /// parser tracks how many characters are loaded in the buffer and acts accordingly.
34 fn lookahead(&mut self, count: usize);
35
36 /// Return the number of buffered characters in `self`.
37 #[must_use]
38 fn buflen(&self) -> usize;
39
40 /// Return the capacity of the buffer in `self`.
41 #[must_use]
42 fn bufmaxlen(&self) -> usize;
43
44 /// Return whether the buffer (!= stream) is empty.
45 #[inline]
46 #[must_use]
47 fn buf_is_empty(&self) -> bool {
48 self.buflen() == 0
49 }
50
51 /// Read a character from the input stream and return it directly.
52 ///
53 /// The internal buffer (if any) is bypassed.
54 #[must_use]
55 fn raw_read_ch(&mut self) -> char;
56
57 /// Read a non-breakz a character from the input stream and return it directly.
58 ///
59 /// The internal buffer (if any) is bypassed.
60 ///
61 /// If the next character is a breakz, it is either not consumed or placed into the buffer (if
62 /// any).
63 #[must_use]
64 fn raw_read_non_breakz_ch(&mut self) -> Option<char>;
65
66 /// Consume the next character.
67 fn skip(&mut self);
68
69 /// Consume the next `count` character.
70 fn skip_n(&mut self, count: usize);
71
72 /// Return the next character, without consuming it.
73 ///
74 /// Users of the [`Input`] must make sure that the character has been loaded through a prior
75 /// call to [`Input::lookahead`]. Implementors of [`Input`] may assume that a valid call to
76 /// [`Input::lookahead`] has been made beforehand.
77 ///
78 /// # Return
79 /// If the input source is not exhausted, returns the next character to be fed into the
80 /// scanner. Otherwise, returns `\0`.
81 #[must_use]
82 fn peek(&self) -> char;
83
84 /// Return the `n`-th character in the buffer, without consuming it.
85 ///
86 /// This function assumes that the n-th character in the input has already been fetched through
87 /// [`Input::lookahead`].
88 #[must_use]
89 fn peek_nth(&self, n: usize) -> char;
90
91 /// Look for the next character and return it.
92 ///
93 /// The character is not consumed.
94 /// Equivalent to calling [`Input::lookahead`] and [`Input::peek`].
95 #[inline]
96 #[must_use]
97 fn look_ch(&mut self) -> char {
98 self.lookahead(1);
99 self.peek()
100 }
101
102 /// Return whether the next character in the input source is equal to `c`.
103 ///
104 /// This function assumes that the next character in the input has already been fetched through
105 /// [`Input::lookahead`].
106 #[inline]
107 #[must_use]
108 fn next_char_is(&self, c: char) -> bool {
109 self.peek() == c
110 }
111
112 /// Return whether the `n`-th character in the input source is equal to `c`.
113 ///
114 /// This function assumes that the n-th character in the input has already been fetched through
115 /// [`Input::lookahead`].
116 #[inline]
117 #[must_use]
118 fn nth_char_is(&self, n: usize, c: char) -> bool {
119 self.peek_nth(n) == c
120 }
121
122 /// Return whether the next 2 characters in the input source match the given characters.
123 ///
124 /// This function assumes that the next 2 characters in the input have already been fetched
125 /// through [`Input::lookahead`].
126 #[inline]
127 #[must_use]
128 fn next_2_are(&self, c1: char, c2: char) -> bool {
129 assert!(self.buflen() >= 2);
130 self.peek() == c1 && self.peek_nth(1) == c2
131 }
132
133 /// Return whether the next 3 characters in the input source match the given characters.
134 ///
135 /// This function assumes that the next 3 characters in the input have already been fetched
136 /// through [`Input::lookahead`].
137 #[inline]
138 #[must_use]
139 fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
140 assert!(self.buflen() >= 3);
141 self.peek() == c1 && self.peek_nth(1) == c2 && self.peek_nth(2) == c3
142 }
143
144 /// Check whether the next characters correspond to a document indicator.
145 ///
146 /// This function assumes that the next 4 characters in the input has already been fetched
147 /// through [`Input::lookahead`].
148 #[inline]
149 #[must_use]
150 fn next_is_document_indicator(&self) -> bool {
151 assert!(self.buflen() >= 4);
152 is_blank_or_breakz(self.peek_nth(3))
153 && (self.next_3_are('.', '.', '.') || self.next_3_are('-', '-', '-'))
154 }
155
156 /// Check whether the next characters correspond to a start of document.
157 ///
158 /// This function assumes that the next 4 characters in the input has already been fetched
159 /// through [`Input::lookahead`].
160 #[inline]
161 #[must_use]
162 fn next_is_document_start(&self) -> bool {
163 assert!(self.buflen() >= 4);
164 self.next_3_are('-', '-', '-') && is_blank_or_breakz(self.peek_nth(3))
165 }
166
167 /// Check whether the next characters correspond to an end of document.
168 ///
169 /// This function assumes that the next 4 characters in the input has already been fetched
170 /// through [`Input::lookahead`].
171 #[inline]
172 #[must_use]
173 fn next_is_document_end(&self) -> bool {
174 assert!(self.buflen() >= 4);
175 self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
176 }
177
178 /// Skip yaml whitespace at most up to eol. Also skips comments. Advances the input.
179 ///
180 /// # Return
181 /// Return a tuple with the number of characters that were consumed and the result of skipping
182 /// whitespace. The number of characters returned can be used to advance the index and column,
183 /// since no end-of-line character will be consumed.
184 /// See [`SkipTabs`] For more details on the success variant.
185 ///
186 /// # Errors
187 /// Errors if a comment is encountered but it was not preceded by a whitespace. In that event,
188 /// the first tuple element will contain the number of characters consumed prior to reaching
189 /// the `#`.
190 fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
191 let mut encountered_tab = false;
192 let mut has_yaml_ws = false;
193 let mut chars_consumed = 0;
194 loop {
195 match self.look_ch() {
196 ' ' => {
197 has_yaml_ws = true;
198 self.skip();
199 }
200 '\t' if skip_tabs != SkipTabs::No => {
201 encountered_tab = true;
202 self.skip();
203 }
204 // YAML comments must be preceded by whitespace.
205 '#' if !encountered_tab && !has_yaml_ws => {
206 return (
207 chars_consumed,
208 Err("comments must be separated from other tokens by whitespace"),
209 );
210 }
211 '#' => {
212 self.skip(); // Skip over '#'
213 while !is_breakz(self.look_ch()) {
214 self.skip();
215 chars_consumed += 1;
216 }
217 }
218 _ => break,
219 }
220 chars_consumed += 1;
221 }
222
223 (
224 chars_consumed,
225 Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
226 )
227 }
228
229 /// Check whether the next characters may be part of a plain scalar.
230 ///
231 /// This function assumes we are not given a blankz character.
232 #[allow(clippy::inline_always)]
233 #[inline(always)]
234 fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
235 let nc = self.peek_nth(1);
236 match self.peek() {
237 // indicators can end a plain scalar, see 7.3.3. Plain Style
238 ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
239 c if in_flow && is_flow(c) => false,
240 _ => true,
241 }
242 }
243
244 /// Check whether the next character is [a blank] or [a break].
245 ///
246 /// The character must have previously been fetched through [`lookahead`]
247 ///
248 /// # Return
249 /// Returns true if the character is [a blank] or [a break], false otherwise.
250 ///
251 /// [`lookahead`]: Input::lookahead
252 /// [a blank]: is_blank
253 /// [a break]: is_break
254 #[inline]
255 fn next_is_blank_or_break(&self) -> bool {
256 is_blank(self.peek()) || is_break(self.peek())
257 }
258
259 /// Check whether the next character is [a blank] or [a breakz].
260 ///
261 /// The character must have previously been fetched through [`lookahead`]
262 ///
263 /// # Return
264 /// Returns true if the character is [a blank] or [a break], false otherwise.
265 ///
266 /// [`lookahead`]: Input::lookahead
267 /// [a blank]: is_blank
268 /// [a breakz]: is_breakz
269 #[inline]
270 fn next_is_blank_or_breakz(&self) -> bool {
271 is_blank(self.peek()) || is_breakz(self.peek())
272 }
273
274 /// Check whether the next character is [a blank].
275 ///
276 /// The character must have previously been fetched through [`lookahead`]
277 ///
278 /// # Return
279 /// Returns true if the character is [a blank], false otherwise.
280 ///
281 /// [`lookahead`]: Input::lookahead
282 /// [a blank]: is_blank
283 #[inline]
284 fn next_is_blank(&self) -> bool {
285 is_blank(self.peek())
286 }
287
288 /// Check whether the next character is [a break].
289 ///
290 /// The character must have previously been fetched through [`lookahead`]
291 ///
292 /// # Return
293 /// Returns true if the character is [a break], false otherwise.
294 ///
295 /// [`lookahead`]: Input::lookahead
296 /// [a break]: is_break
297 #[inline]
298 fn next_is_break(&self) -> bool {
299 is_break(self.peek())
300 }
301
302 /// Check whether the next character is [a breakz].
303 ///
304 /// The character must have previously been fetched through [`lookahead`]
305 ///
306 /// # Return
307 /// Returns true if the character is [a breakz], false otherwise.
308 ///
309 /// [`lookahead`]: Input::lookahead
310 /// [a breakz]: is_breakz
311 #[inline]
312 fn next_is_breakz(&self) -> bool {
313 is_breakz(self.peek())
314 }
315
316 /// Check whether the next character is [a z].
317 ///
318 /// The character must have previously been fetched through [`lookahead`]
319 ///
320 /// # Return
321 /// Returns true if the character is [a z], false otherwise.
322 ///
323 /// [`lookahead`]: Input::lookahead
324 /// [a z]: is_z
325 #[inline]
326 fn next_is_z(&self) -> bool {
327 is_z(self.peek())
328 }
329
330 /// Check whether the next character is [a flow].
331 ///
332 /// The character must have previously been fetched through [`lookahead`]
333 ///
334 /// # Return
335 /// Returns true if the character is [a flow], false otherwise.
336 ///
337 /// [`lookahead`]: Input::lookahead
338 /// [a flow]: is_flow
339 #[inline]
340 fn next_is_flow(&self) -> bool {
341 is_flow(self.peek())
342 }
343
344 /// Check whether the next character is [a digit].
345 ///
346 /// The character must have previously been fetched through [`lookahead`]
347 ///
348 /// # Return
349 /// Returns true if the character is [a digit], false otherwise.
350 ///
351 /// [`lookahead`]: Input::lookahead
352 /// [a digit]: is_digit
353 #[inline]
354 fn next_is_digit(&self) -> bool {
355 is_digit(self.peek())
356 }
357
358 /// Check whether the next character is [a letter].
359 ///
360 /// The character must have previously been fetched through [`lookahead`]
361 ///
362 /// # Return
363 /// Returns true if the character is [a letter], false otherwise.
364 ///
365 /// [`lookahead`]: Input::lookahead
366 /// [a letter]: is_alpha
367 #[inline]
368 fn next_is_alpha(&self) -> bool {
369 is_alpha(self.peek())
370 }
371
372 /// Skip characters from the input until a [breakz] is found.
373 ///
374 /// The characters are consumed from the input.
375 ///
376 /// # Return
377 /// Return the number of characters that were consumed. The number of characters returned can
378 /// be used to advance the index and column, since no end-of-line character will be consumed.
379 ///
380 /// [breakz]: is_breakz
381 #[inline]
382 fn skip_while_non_breakz(&mut self) -> usize {
383 let mut count = 0;
384 while !is_breakz(self.look_ch()) {
385 count += 1;
386 self.skip();
387 }
388 count
389 }
390
391 /// Skip characters from the input while [blanks] are found.
392 ///
393 /// The characters are consumed from the input.
394 ///
395 /// # Return
396 /// Return the number of characters that were consumed. The number of characters returned can
397 /// be used to advance the index and column, since no end-of-line character will be consumed.
398 ///
399 /// [blanks]: is_blank
400 fn skip_while_blank(&mut self) -> usize {
401 let mut n_chars = 0;
402 while is_blank(self.look_ch()) {
403 n_chars += 1;
404 self.skip();
405 }
406 n_chars
407 }
408
409 /// Fetch characters from the input while we encounter letters and store them in `out`.
410 ///
411 /// The characters are consumed from the input.
412 ///
413 /// # Return
414 /// Return the number of characters that were consumed. The number of characters returned can
415 /// be used to advance the index and column, since no end-of-line character will be consumed.
416 fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
417 let mut n_chars = 0;
418 while is_alpha(self.look_ch()) {
419 n_chars += 1;
420 out.push(self.peek());
421 self.skip();
422 }
423 n_chars
424 }
425}
426
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
///
/// The same type serves both as the instruction given to `skip_ws_to_eol` ([`SkipTabs::Yes`] and
/// [`SkipTabs::No`]) and as the outcome it reports ([`SkipTabs::Result`]).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least 1 valid yaml whitespace has been encountered.
        bool,
    ),
}

impl SkipTabs {
    /// Whether tabs were found while skipping whitespace.
    ///
    /// This function must be called after a call to `skip_ws_to_eol`: only the
    /// [`SkipTabs::Result`] variant carries that information. Any other variant yields `false`.
    #[must_use]
    pub fn found_tabs(self) -> bool {
        matches!(self, SkipTabs::Result(true, _))
    }

    /// Whether a valid YAML whitespace has been found in skipped-over content.
    ///
    /// This function must be called after a call to `skip_ws_to_eol`: only the
    /// [`SkipTabs::Result`] variant carries that information. Any other variant yields `false`.
    #[must_use]
    pub fn has_valid_yaml_ws(self) -> bool {
        matches!(self, SkipTabs::Result(_, true))
    }
}