Skip to main content

saphyr_parser_bw/input/
str.rs

1use crate::{
2    char_traits::{is_blank_or_breakz, is_breakz, is_flow},
3    input::{BorrowedInput, Input, SkipTabs},
4};
5use alloc::string::String;
6
/// A parser [`Input`] that reads from a borrowed `&str`.
#[allow(clippy::module_name_repetitions)]
pub struct StrInput<'a> {
    /// The complete string this input was created from.
    ///
    /// It is retained so that the optional [`Input::byte_offset`] / [`Input::slice_bytes`] APIs
    /// can report byte offsets in O(1) and hand out zero-copy slices.
    original: &'a str,
    /// The part of [`Self::original`] that has not been consumed yet.
    ///
    /// This is a window that only ever shrinks from the front: every consuming operation
    /// advances this slice.
    buffer: &'a str,
    /// The look-ahead, in characters, that the parser has requested.
    ///
    /// Nothing is actually buffered for a `&str` source; the value is only remembered so that
    /// it can be reported back by [`Input::buflen`].
    lookahead: usize,
}
26
27impl<'a> StrInput<'a> {
28    /// Create a new [`StrInput`] with the given str.
29    #[must_use]
30    pub fn new(input: &'a str) -> Self {
31        Self {
32            original: input,
33            buffer: input,
34            lookahead: 0,
35        }
36    }
37
38    /// Return the number of bytes consumed from the original input.
39    ///
40    /// This is an O(1) operation derived from the invariant that [`Self::buffer`] is always a
41    /// suffix of [`Self::original`].
42    #[inline]
43    #[must_use]
44    fn consumed_bytes(&self) -> usize {
45        self.original.len() - self.buffer.len()
46    }
47}
48
49impl Input for StrInput<'_> {
50    #[inline]
51    fn lookahead(&mut self, x: usize) {
52        // We already have all characters that we need.
53        // We cannot add '\0's to the buffer should we prematurely reach EOF.
54        // Returning '\0's befalls the character-retrieving functions.
55        self.lookahead = self.lookahead.max(x);
56    }
57
58    #[inline]
59    fn buflen(&self) -> usize {
60        self.lookahead
61    }
62
63    #[inline]
64    fn bufmaxlen(&self) -> usize {
65        BUFFER_LEN
66    }
67
68    fn buf_is_empty(&self) -> bool {
69        self.buflen() == 0
70    }
71
72    #[inline]
73    fn raw_read_ch(&mut self) -> char {
74        let mut chars = self.buffer.chars();
75        if let Some(c) = chars.next() {
76            self.buffer = chars.as_str();
77            c
78        } else {
79            '\0'
80        }
81    }
82
83    #[inline]
84    fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
85        if let Some((c, sub_str)) = split_first_char(self.buffer) {
86            if is_breakz(c) {
87                None
88            } else {
89                self.buffer = sub_str;
90                Some(c)
91            }
92        } else {
93            None
94        }
95    }
96
97    #[inline]
98    fn skip(&mut self) {
99        if !self.buffer.is_empty() {
100            let b = self.buffer.as_bytes()[0];
101            if b < 0x80 {
102                self.buffer = &self.buffer[1..];
103            } else {
104                let mut chars = self.buffer.chars();
105                chars.next();
106                self.buffer = chars.as_str();
107            }
108        }
109    }
110
111    #[inline]
112    fn skip_n(&mut self, count: usize) {
113        let mut chars = self.buffer.chars();
114        for _ in 0..count {
115            if chars.next().is_none() {
116                break;
117            }
118        }
119        self.buffer = chars.as_str();
120    }
121
122    #[inline]
123    fn peek(&self) -> char {
124        if self.buffer.is_empty() {
125            return '\0';
126        }
127        let b = self.buffer.as_bytes()[0];
128        if b < 0x80 {
129            b as char
130        } else {
131            self.buffer.chars().next().unwrap()
132        }
133    }
134
135    #[inline]
136    fn peek_nth(&self, n: usize) -> char {
137        if n == 0 {
138            return self.peek();
139        }
140        let bytes = self.buffer.as_bytes();
141        if n == 1 && bytes.len() >= 2 && bytes[0] < 0x80 && bytes[1] < 0x80 {
142            return bytes[1] as char;
143        }
144        let mut chars = self.buffer.chars();
145        for _ in 0..n {
146            if chars.next().is_none() {
147                return '\0';
148            }
149        }
150        chars.next().unwrap_or('\0')
151    }
152
153    #[inline]
154    fn byte_offset(&self) -> Option<usize> {
155        Some(self.consumed_bytes())
156    }
157
158    #[inline]
159    fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
160        debug_assert!(start <= end);
161        debug_assert!(end <= self.original.len());
162        self.original.get(start..end)
163    }
164
165    #[inline]
166    fn look_ch(&mut self) -> char {
167        self.lookahead(1);
168        self.peek()
169    }
170
171    #[inline]
172    fn next_char_is(&self, c: char) -> bool {
173        self.peek() == c
174    }
175
176    #[inline]
177    fn nth_char_is(&self, n: usize, c: char) -> bool {
178        self.peek_nth(n) == c
179    }
180
181    #[inline]
182    fn next_2_are(&self, c1: char, c2: char) -> bool {
183        let mut chars = self.buffer.chars();
184        chars.next() == Some(c1) && chars.next() == Some(c2)
185    }
186
187    #[inline]
188    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
189        let mut chars = self.buffer.chars();
190        chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3)
191    }
192
193    #[inline]
194    fn next_is_document_indicator(&self) -> bool {
195        if self.buffer.len() < 3 {
196            false
197        } else {
198            // Since all characters we look for are ascii, we can directly use the byte API of str.
199            let bytes = self.buffer.as_bytes();
200            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
201                && (bytes[0] == b'.' || bytes[0] == b'-')
202                && bytes[0] == bytes[1]
203                && bytes[1] == bytes[2]
204        }
205    }
206
207    #[inline]
208    fn next_is_document_start(&self) -> bool {
209        if self.buffer.len() < 3 {
210            false
211        } else {
212            // Since all characters we look for are ascii, we can directly use the byte API of str.
213            let bytes = self.buffer.as_bytes();
214            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
215                && bytes[0] == b'-'
216                && bytes[1] == b'-'
217                && bytes[2] == b'-'
218        }
219    }
220
221    #[inline]
222    fn next_is_document_end(&self) -> bool {
223        if self.buffer.len() < 3 {
224            false
225        } else {
226            // Since all characters we look for are ascii, we can directly use the byte API of str.
227            let bytes = self.buffer.as_bytes();
228            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
229                && bytes[0] == b'.'
230                && bytes[1] == b'.'
231                && bytes[2] == b'.'
232        }
233    }
234
235    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
236        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
237
238        let mut new_str = self.buffer;
239        let mut has_yaml_ws = false;
240        let mut encountered_tab = false;
241
242        // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
243        // while keeping track of whether we encountered spaces and/or tabs.
244        if skip_tabs == SkipTabs::Yes {
245            loop {
246                if let Some(sub_str) = new_str.strip_prefix(' ') {
247                    has_yaml_ws = true;
248                    new_str = sub_str;
249                } else if let Some(sub_str) = new_str.strip_prefix('\t') {
250                    encountered_tab = true;
251                    new_str = sub_str;
252                } else {
253                    break;
254                }
255            }
256        } else {
257            while let Some(sub_str) = new_str.strip_prefix(' ') {
258                has_yaml_ws = true;
259                new_str = sub_str;
260            }
261        }
262
263        // All characters consumed were ascii. We can use the byte length difference to count the
264        // number of whitespace ignored.
265        let mut chars_consumed = self.buffer.len() - new_str.len();
266
267        if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
268            if !encountered_tab && !has_yaml_ws {
269                return (
270                    chars_consumed,
271                    Err("comments must be separated from other tokens by whitespace"),
272                );
273            }
274
275            // Skip remaining characters until we hit a breakz.
276            while let Some((c, sub_str)) = split_first_char(new_str) {
277                if is_breakz(c) {
278                    break;
279                }
280                new_str = sub_str;
281                chars_consumed += 1;
282            }
283        }
284
285        self.buffer = new_str;
286
287        (
288            chars_consumed,
289            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
290        )
291    }
292
293    #[allow(clippy::inline_always)]
294    #[inline(always)]
295    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
296        let nc = self.peek_nth(1);
297        match self.peek() {
298            // indicators can end a plain scalar, see 7.3.3. Plain Style
299            ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
300            c if in_flow && is_flow(c) => false,
301            _ => true,
302        }
303    }
304
305    #[inline]
306    fn next_is_blank_or_break(&self) -> bool {
307        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b' ' | b'\t' | b'\n' | b'\r')
308    }
309
310    #[inline]
311    fn next_is_blank_or_breakz(&self) -> bool {
312        self.buffer.is_empty()
313            || matches!(self.buffer.as_bytes()[0], b' ' | b'\t' | 0 | b'\n' | b'\r')
314    }
315
316    #[inline]
317    fn next_is_blank(&self) -> bool {
318        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b' ' | b'\t')
319    }
320
321    #[inline]
322    fn next_is_break(&self) -> bool {
323        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b'\n' | b'\r')
324    }
325
326    #[inline]
327    fn next_is_breakz(&self) -> bool {
328        self.buffer.is_empty() || matches!(self.buffer.as_bytes()[0], 0 | b'\n' | b'\r')
329    }
330
331    #[inline]
332    fn next_is_z(&self) -> bool {
333        self.buffer.is_empty() || self.buffer.as_bytes()[0] == 0
334    }
335
336    #[inline]
337    fn next_is_flow(&self) -> bool {
338        !self.buffer.is_empty()
339            && matches!(self.buffer.as_bytes()[0], b',' | b'[' | b']' | b'{' | b'}')
340    }
341
342    #[inline]
343    fn next_is_digit(&self) -> bool {
344        !self.buffer.is_empty() && self.buffer.as_bytes()[0].is_ascii_digit()
345    }
346
347    /// Check if the next character is an ASCII alphanumeric, `_`, or `-`.
348    ///
349    /// This is used as a heuristic for error detection (e.g., when `:` is followed
350    /// by tab and then a potential value character). The ASCII-only check is intentional:
351    /// it catches common cases like `key:\tvalue` while avoiding false positives for
352    /// valid YAML constructs. Unicode value starters (e.g., `äöü`) are not detected,
353    /// but such cases will still fail to parse (with a less specific error message).
354    #[inline]
355    fn next_is_alpha(&self) -> bool {
356        !self.buffer.is_empty()
357            && matches!(self.buffer.as_bytes()[0], b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-')
358    }
359
360    fn skip_while_non_breakz(&mut self) -> usize {
361        let mut byte_pos = 0;
362        let mut chars_consumed = 0;
363
364        for (i, c) in self.buffer.char_indices() {
365            if is_breakz(c) {
366                break;
367            }
368            byte_pos = i + c.len_utf8();
369            chars_consumed += 1;
370        }
371
372        self.buffer = &self.buffer[byte_pos..];
373        chars_consumed
374    }
375
376    #[inline]
377    fn skip_while_blank(&mut self) -> usize {
378        let bytes = self.buffer.as_bytes();
379
380        let mut i = 0;
381        while i < bytes.len() {
382            match bytes[i] {
383                b' ' | b'\t' => i += 1,
384                _ => break,
385            }
386        }
387
388        self.buffer = &self.buffer[i..];
389        i
390    }
391
392    /// Fetch characters matching `is_alpha` (ASCII alphanumeric, `_`, `-`).
393    ///
394    /// This is used for scanning tag handles (e.g., `!foo!`). Per YAML 1.2 spec,
395    /// tag handles use `ns-word-char` which is `[0-9a-zA-Z-]`. Our implementation
396    /// is slightly more permissive by also accepting `_`, but this is harmless
397    /// and matches common practice. Unicode characters like `ä` or `π` are NOT
398    /// valid in tag handles per spec, so the ASCII-only byte-based scanning here
399    /// is both correct and efficient.
400    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
401        let bytes = self.buffer.as_bytes();
402        let mut i = 0;
403
404        // All target characters are ASCII, so we can scan bytes directly.
405        while i < bytes.len() {
406            match bytes[i] {
407                b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-' => i += 1,
408                _ => break,
409            }
410        }
411
412        // All matched characters are ASCII, so we can safely slice and convert.
413        out.push_str(&self.buffer[..i]);
414        self.buffer = &self.buffer[i..];
415
416        i
417    }
418
419    fn fetch_while_is_yaml_non_space(&mut self, out: &mut String) -> usize {
420        let byte_pos = self
421            .buffer
422            .chars()
423            .take_while(|c| crate::char_traits::is_yaml_non_space(*c))
424            .map(char::len_utf8)
425            .sum();
426
427        out.push_str(&self.buffer[..byte_pos]);
428
429        self.buffer = &self.buffer[byte_pos..];
430
431        byte_pos
432    }
433
434    fn fetch_plain_scalar_chunk(
435        &mut self,
436        out: &mut String,
437        _count: usize,
438        flow_level_gt_0: bool,
439    ) -> (bool, usize) {
440        let bytes = self.buffer.as_bytes();
441        let len = bytes.len();
442        let mut byte_pos = 0;
443        let mut chars_consumed = 0;
444
445        while byte_pos < len {
446            let b = bytes[byte_pos];
447            if b < 0x80 {
448                let c = b as char;
449                if crate::char_traits::is_blank_or_breakz(c) {
450                    out.push_str(&self.buffer[..byte_pos]);
451                    self.buffer = &self.buffer[byte_pos..];
452                    return (true, chars_consumed);
453                }
454                if flow_level_gt_0 && crate::char_traits::is_flow(c) {
455                    out.push_str(&self.buffer[..byte_pos]);
456                    self.buffer = &self.buffer[byte_pos..];
457                    return (true, chars_consumed);
458                }
459                if c == ':' {
460                    let next_byte = if byte_pos + 1 < len {
461                        bytes[byte_pos + 1]
462                    } else {
463                        0
464                    };
465                    // ASCII optimization: if next_byte >= 0x80, it is not blank/breakz/flow
466                    let is_stop = if next_byte < 0x80 {
467                        let nc = next_byte as char;
468                        crate::char_traits::is_blank_or_breakz(nc)
469                            || (flow_level_gt_0 && crate::char_traits::is_flow(nc))
470                    } else {
471                        false
472                    };
473
474                    if is_stop {
475                        out.push_str(&self.buffer[..byte_pos]);
476                        self.buffer = &self.buffer[byte_pos..];
477                        return (true, chars_consumed);
478                    }
479                }
480                byte_pos += 1;
481                chars_consumed += 1;
482            } else {
483                let mut chars = self.buffer[byte_pos..].chars();
484                let c = chars.next().unwrap();
485                byte_pos += c.len_utf8();
486                chars_consumed += 1;
487            }
488        }
489
490        out.push_str(&self.buffer[..byte_pos]);
491        self.buffer = &self.buffer[byte_pos..];
492        // If we reached here, we consumed the whole string (EOF).
493        // EOF is effectively a stop condition (breakz).
494        (true, chars_consumed)
495    }
496}
497
498impl<'a> BorrowedInput<'a> for StrInput<'a> {
499    #[inline]
500    fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'a str> {
501        debug_assert!(start <= end);
502        debug_assert!(end <= self.original.len());
503        self.original.get(start..end)
504    }
505}
506
/// The buffer size we return to the scanner.
///
/// This does not correspond to any allocated buffer size. In practice, the scanner can request
/// any character it wants. If it is within the input buffer, that character is returned;
/// otherwise `\0` is returned.
///
/// The number of characters we are asked to retrieve in [`lookahead`] depends on the buffer size
/// of the input. Our buffer here is virtually unlimited, but the scanner cannot work with that. It
/// may allocate buffers of its own of the size we return in [`bufmaxlen`] (so we can't return
/// [`usize::MAX`]). We can't always return the number of characters left either, as the scanner
/// expects [`buflen`] to return the same value that was given to [`lookahead`] right after its
/// call.
///
/// This creates a complex situation where [`bufmaxlen`] influences what value [`lookahead`] is
/// called with, which in turn dictates what [`buflen`] returns. In order to avoid breaking any
/// function, we return this constant in [`bufmaxlen`] which, since the input is processed one line
/// at a time, should strike a good balance between memory consumption and what we expect the
/// maximum line length to be.
///
/// [`lookahead`]: `StrInput::lookahead`
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
/// [`buflen`]: `StrInput::buflen`
const BUFFER_LEN: usize = 128;
530
/// Split `s` into its first character and the remainder of the string.
///
/// Returns `None` when `s` is empty.
#[inline]
fn split_first_char(s: &str) -> Option<(char, &str)> {
    s.chars().next().map(|first| {
        let rest = &s[first.len_utf8()..];
        (first, rest)
    })
}
539
#[cfg(test)]
mod test {
    use super::StrInput;
    use crate::input::Input;

    /// `---` is a document start when alone or followed by a blank or a break; `...` is not.
    /// Every sample is a document indicator.
    #[test]
    pub fn is_document_start() {
        // (input, expected result of `next_is_document_start`)
        let cases = [("---\n", true), ("---", true), ("...\n", false), ("--- ", true)];

        for &(text, expected) in &cases {
            let input = StrInput::new(text);
            assert_eq!(input.next_is_document_start(), expected);
            assert!(input.next_is_document_indicator());
        }
    }

    /// `...` is a document end when alone or followed by a blank or a break; `---` is not.
    /// Every sample is a document indicator.
    #[test]
    pub fn is_document_end() {
        // (input, expected result of `next_is_document_end`)
        let cases = [("...\n", true), ("...", true), ("---\n", false), ("... ", true)];

        for &(text, expected) in &cases {
            let input = StrInput::new(text);
            assert_eq!(input.next_is_document_end(), expected);
            assert!(input.next_is_document_indicator());
        }
    }
}