saphyr_parser/input/
str.rs

1use crate::{
2    char_traits::{
3        is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
4    },
5    input::{Input, SkipTabs},
6};
7
/// A parser input that reads its characters from a borrowed `&str`.
#[allow(clippy::module_name_repetitions)]
pub struct StrInput<'a> {
    /// The part of the source string that has not been consumed yet.
    buffer: &'a str,
    /// The largest look-ahead the parser has requested so far.
    ///
    /// The whole input is always at hand, so no character is actually buffered. The requested
    /// amount is only remembered so that [`Self::buflen`] can report it back.
    lookahead: usize,
}

impl<'a> StrInput<'a> {
    /// Create a new [`StrInput`] that reads from `input`, with no look-ahead requested yet.
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        StrInput {
            lookahead: 0,
            buffer: input,
        }
    }
}
30
31impl<'a> Input for StrInput<'a> {
32    #[inline]
33    fn lookahead(&mut self, x: usize) {
34        // We already have all characters that we need.
35        // We cannot add '\0's to the buffer should we prematurely reach EOF.
36        // Returning '\0's befalls the character-retrieving functions.
37        self.lookahead = self.lookahead.max(x);
38    }
39
40    #[inline]
41    fn buflen(&self) -> usize {
42        self.lookahead
43    }
44
45    #[inline]
46    fn bufmaxlen(&self) -> usize {
47        BUFFER_LEN
48    }
49
50    fn buf_is_empty(&self) -> bool {
51        self.buflen() == 0
52    }
53
54    #[inline]
55    fn raw_read_ch(&mut self) -> char {
56        let mut chars = self.buffer.chars();
57        if let Some(c) = chars.next() {
58            self.buffer = chars.as_str();
59            c
60        } else {
61            '\0'
62        }
63    }
64
65    #[inline]
66    fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
67        if let Some((c, sub_str)) = split_first_char(self.buffer) {
68            if is_breakz(c) {
69                None
70            } else {
71                self.buffer = sub_str;
72                Some(c)
73            }
74        } else {
75            None
76        }
77    }
78
79    #[inline]
80    fn skip(&mut self) {
81        let mut chars = self.buffer.chars();
82        if chars.next().is_some() {
83            self.buffer = chars.as_str();
84        }
85    }
86
87    #[inline]
88    fn skip_n(&mut self, count: usize) {
89        let mut chars = self.buffer.chars();
90        for _ in 0..count {
91            if chars.next().is_none() {
92                break;
93            }
94        }
95        self.buffer = chars.as_str();
96    }
97
98    #[inline]
99    fn peek(&self) -> char {
100        self.buffer.chars().next().unwrap_or('\0')
101    }
102
103    #[inline]
104    fn peek_nth(&self, n: usize) -> char {
105        let mut chars = self.buffer.chars();
106        for _ in 0..n {
107            if chars.next().is_none() {
108                return '\0';
109            }
110        }
111        chars.next().unwrap_or('\0')
112    }
113
114    #[inline]
115    fn look_ch(&mut self) -> char {
116        self.lookahead(1);
117        self.peek()
118    }
119
120    #[inline]
121    fn next_char_is(&self, c: char) -> bool {
122        self.peek() == c
123    }
124
125    #[inline]
126    fn nth_char_is(&self, n: usize, c: char) -> bool {
127        self.peek_nth(n) == c
128    }
129
130    #[inline]
131    fn next_2_are(&self, c1: char, c2: char) -> bool {
132        let mut chars = self.buffer.chars();
133        chars.next().is_some_and(|c| c == c1) && chars.next().is_some_and(|c| c == c2)
134    }
135
136    #[inline]
137    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
138        let mut chars = self.buffer.chars();
139        chars.next().is_some_and(|c| c == c1)
140            && chars.next().is_some_and(|c| c == c2)
141            && chars.next().is_some_and(|c| c == c3)
142    }
143
144    #[inline]
145    fn next_is_document_indicator(&self) -> bool {
146        if self.buffer.len() < 3 {
147            false
148        } else {
149            // Since all characters we look for are ascii, we can directly use the byte API of str.
150            let bytes = self.buffer.as_bytes();
151            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
152                && (bytes[0] == b'.' || bytes[0] == b'-')
153                && bytes[0] == bytes[1]
154                && bytes[1] == bytes[2]
155        }
156    }
157
158    #[inline]
159    fn next_is_document_start(&self) -> bool {
160        if self.buffer.len() < 3 {
161            false
162        } else {
163            // Since all characters we look for are ascii, we can directly use the byte API of str.
164            let bytes = self.buffer.as_bytes();
165            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
166                && bytes[0] == b'-'
167                && bytes[1] == b'-'
168                && bytes[2] == b'-'
169        }
170    }
171
172    #[inline]
173    fn next_is_document_end(&self) -> bool {
174        if self.buffer.len() < 3 {
175            false
176        } else {
177            // Since all characters we look for are ascii, we can directly use the byte API of str.
178            let bytes = self.buffer.as_bytes();
179            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
180                && bytes[0] == b'.'
181                && bytes[1] == b'.'
182                && bytes[2] == b'.'
183        }
184    }
185
186    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
187        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
188
189        let mut new_str = self.buffer;
190        let mut has_yaml_ws = false;
191        let mut encountered_tab = false;
192
193        // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
194        // while keeping track of whether we encountered spaces and/or tabs.
195        if skip_tabs == SkipTabs::Yes {
196            loop {
197                if let Some(sub_str) = new_str.strip_prefix(' ') {
198                    has_yaml_ws = true;
199                    new_str = sub_str;
200                } else if let Some(sub_str) = new_str.strip_prefix('\t') {
201                    encountered_tab = true;
202                    new_str = sub_str;
203                } else {
204                    break;
205                }
206            }
207        } else {
208            while let Some(sub_str) = new_str.strip_prefix(' ') {
209                has_yaml_ws = true;
210                new_str = sub_str;
211            }
212        }
213
214        // All characters consumed were ascii. We can use the byte length difference to count the
215        // number of whitespace ignored.
216        let mut chars_consumed = self.buffer.len() - new_str.len();
217
218        if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
219            if !encountered_tab && !has_yaml_ws {
220                return (
221                    chars_consumed,
222                    Err("comments must be separated from other tokens by whitespace"),
223                );
224            }
225
226            // Skip remaining characters until we hit a breakz.
227            while let Some((c, sub_str)) = split_first_char(new_str) {
228                if is_breakz(c) {
229                    break;
230                }
231                new_str = sub_str;
232                chars_consumed += 1;
233            }
234        }
235
236        self.buffer = new_str;
237
238        (
239            chars_consumed,
240            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
241        )
242    }
243
244    #[allow(clippy::inline_always)]
245    #[inline(always)]
246    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
247        let c = self.buffer.as_bytes()[0];
248        if self.buffer.len() > 1 {
249            let nc = self.buffer.as_bytes()[1];
250            match c {
251                // indicators can end a plain scalar, see 7.3.3. Plain Style
252                b':' if is_blank_or_breakz(nc as char) || (in_flow && is_flow(nc as char)) => false,
253                c if in_flow && is_flow(c as char) => false,
254                _ => true,
255            }
256        } else {
257            match c {
258                // indicators can end a plain scalar, see 7.3.3. Plain Style
259                b':' => false,
260                c if in_flow && is_flow(c as char) => false,
261                _ => true,
262            }
263        }
264    }
265
266    #[inline]
267    fn next_is_blank_or_break(&self) -> bool {
268        !self.buffer.is_empty()
269            && (is_blank(self.buffer.as_bytes()[0] as char)
270                || is_break(self.buffer.as_bytes()[0] as char))
271    }
272
273    #[inline]
274    fn next_is_blank_or_breakz(&self) -> bool {
275        self.buffer.is_empty()
276            || (is_blank(self.buffer.as_bytes()[0] as char)
277                || is_breakz(self.buffer.as_bytes()[0] as char))
278    }
279
280    #[inline]
281    fn next_is_blank(&self) -> bool {
282        !self.buffer.is_empty() && is_blank(self.buffer.as_bytes()[0] as char)
283    }
284
285    #[inline]
286    fn next_is_break(&self) -> bool {
287        !self.buffer.is_empty() && is_break(self.buffer.as_bytes()[0] as char)
288    }
289
290    #[inline]
291    fn next_is_breakz(&self) -> bool {
292        self.buffer.is_empty() || is_breakz(self.buffer.as_bytes()[0] as char)
293    }
294
295    #[inline]
296    fn next_is_z(&self) -> bool {
297        self.buffer.is_empty() || is_z(self.buffer.as_bytes()[0] as char)
298    }
299
300    #[inline]
301    fn next_is_flow(&self) -> bool {
302        !self.buffer.is_empty() && is_flow(self.buffer.as_bytes()[0] as char)
303    }
304
305    #[inline]
306    fn next_is_digit(&self) -> bool {
307        !self.buffer.is_empty() && is_digit(self.buffer.as_bytes()[0] as char)
308    }
309
310    #[inline]
311    fn next_is_alpha(&self) -> bool {
312        !self.buffer.is_empty() && is_alpha(self.buffer.as_bytes()[0] as char)
313    }
314
315    fn skip_while_non_breakz(&mut self) -> usize {
316        let mut new_str = self.buffer;
317        let mut count = 0;
318
319        // Skip over all non-breaks.
320        while let Some((c, sub_str)) = split_first_char(new_str) {
321            if is_breakz(c) {
322                break;
323            }
324            new_str = sub_str;
325            count += 1;
326        }
327
328        self.buffer = new_str;
329
330        count
331    }
332
333    fn skip_while_blank(&mut self) -> usize {
334        // Since all characters we look for are ascii, we can directly use the byte API of str.
335        let mut i = 0;
336        while i < self.buffer.len() {
337            if !is_blank(self.buffer.as_bytes()[i] as char) {
338                break;
339            }
340            i += 1;
341        }
342        self.buffer = &self.buffer[i..];
343        i
344    }
345
346    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
347        let mut not_alpha = None;
348
349        // Skip while we have alpha characters.
350        let mut chars = self.buffer.chars();
351        for c in chars.by_ref() {
352            if !is_alpha(c) {
353                not_alpha = Some(c);
354                break;
355            }
356        }
357
358        let remaining_string = if let Some(c) = not_alpha {
359            let n_bytes_read = chars.as_str().as_ptr() as usize - self.buffer.as_ptr() as usize;
360            let last_char_bytes = c.len_utf8();
361            &self.buffer[n_bytes_read - last_char_bytes..]
362        } else {
363            chars.as_str()
364        };
365
366        let n_bytes_to_append = remaining_string.as_ptr() as usize - self.buffer.as_ptr() as usize;
367        out.reserve(n_bytes_to_append);
368        out.push_str(&self.buffer[..n_bytes_to_append]);
369        self.buffer = remaining_string;
370
371        n_bytes_to_append
372    }
373}
374
/// The buffer size reported to the scanner.
///
/// No buffer of this size is ever allocated here. In practice, the scanner may ask for any
/// character it wants: if it lies within the input, that character is returned, otherwise `\0`
/// is returned.
///
/// How many characters we are asked to retrieve in [`lookahead`] depends on the buffer size of
/// the input. Our buffer is virtually unlimited, but the scanner cannot work with that: it may
/// allocate buffers of its own of the size returned by [`bufmaxlen`], so [`usize::MAX`] is not
/// an option. Returning the number of characters left is not always possible either, since the
/// scanner expects [`buflen`] to return the value that was given to [`lookahead`] right after
/// that call.
///
/// This creates a circular situation: [`bufmaxlen`] influences the value [`lookahead`] is called
/// with, which in turn dictates what [`buflen`] returns. To avoid breaking any of these
/// functions, [`bufmaxlen`] returns this constant. Since the input is processed one line at a
/// time, the value is meant as a balance between memory consumption and the maximum line length
/// we expect.
///
/// [`lookahead`]: `StrInput::lookahead`
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
/// [`buflen`]: `StrInput::buflen`
const BUFFER_LEN: usize = 128;
398
/// Split `s` into its first character and everything that follows it.
///
/// Returns `None` when `s` is empty.
#[inline]
fn split_first_char(s: &str) -> Option<(char, &str)> {
    s.chars()
        .next()
        .map(|first| (first, &s[first.len_utf8()..]))
}
407
#[cfg(test)]
mod test {
    use crate::input::Input;

    use super::StrInput;

    /// `---` is a document start whether it is followed by a line break, a space or the end
    /// of the input; `...` is an indicator but not a document start.
    #[test]
    pub fn is_document_start() {
        let cases = [
            ("---\n", true),
            ("---", true),
            ("...\n", false),
            ("--- ", true),
        ];
        for (text, is_start) in cases {
            let input = StrInput::new(text);
            assert_eq!(input.next_is_document_start(), is_start, "input: {text:?}");
            assert!(input.next_is_document_indicator(), "input: {text:?}");
        }
    }

    /// `...` is a document end whether it is followed by a line break, a space or the end of
    /// the input; `---` is an indicator but not a document end.
    #[test]
    pub fn is_document_end() {
        let cases = [
            ("...\n", true),
            ("...", true),
            ("---\n", false),
            ("... ", true),
        ];
        for (text, is_end) in cases {
            let input = StrInput::new(text);
            assert_eq!(input.next_is_document_end(), is_end, "input: {text:?}");
            assert!(input.next_is_document_indicator(), "input: {text:?}");
        }
    }
}