saphyr_parser/input/
str.rs

1use crate::{
2    char_traits::{
3        is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
4    },
5    input::{Input, SkipTabs},
6};
7
/// A parser input that uses a `&str` as source.
///
/// The input is borrowed, never copied: consuming characters only moves the start of `buffer`
/// forward.
// The type name intentionally echoes the name of the module it lives in.
#[allow(clippy::module_name_repetitions)]
#[derive(Debug, Clone)]
pub struct StrInput<'a> {
    /// The part of the input str that has not been consumed yet.
    buffer: &'a str,
    /// The largest number of characters the parser has asked us to look ahead for.
    ///
    /// Nothing needs to be fetched since the whole input is already available, but we must
    /// keep track of what was requested so that we can return the correct value in
    /// [`Self::buflen`].
    lookahead: usize,
}
19
20impl<'a> StrInput<'a> {
21    /// Create a new [`StrInput`] with the given str.
22    #[must_use]
23    pub fn new(input: &'a str) -> Self {
24        Self {
25            buffer: input,
26            lookahead: 0,
27        }
28    }
29}
30
31impl Input for StrInput<'_> {
32    #[inline]
33    fn lookahead(&mut self, x: usize) {
34        // We already have all characters that we need.
35        // We cannot add '\0's to the buffer should we prematurely reach EOF.
36        // Returning '\0's befalls the character-retrieving functions.
37        self.lookahead = self.lookahead.max(x);
38    }
39
40    #[inline]
41    fn buflen(&self) -> usize {
42        self.lookahead
43    }
44
45    #[inline]
46    fn bufmaxlen(&self) -> usize {
47        BUFFER_LEN
48    }
49
50    fn buf_is_empty(&self) -> bool {
51        self.buflen() == 0
52    }
53
54    #[inline]
55    fn raw_read_ch(&mut self) -> char {
56        let mut chars = self.buffer.chars();
57        if let Some(c) = chars.next() {
58            self.buffer = chars.as_str();
59            c
60        } else {
61            '\0'
62        }
63    }
64
65    #[inline]
66    fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
67        if let Some((c, sub_str)) = split_first_char(self.buffer) {
68            if is_breakz(c) {
69                None
70            } else {
71                self.buffer = sub_str;
72                Some(c)
73            }
74        } else {
75            None
76        }
77    }
78
79    #[inline]
80    fn skip(&mut self) {
81        let mut chars = self.buffer.chars();
82        if chars.next().is_some() {
83            self.buffer = chars.as_str();
84        }
85    }
86
87    #[inline]
88    fn skip_n(&mut self, count: usize) {
89        let mut chars = self.buffer.chars();
90        for _ in 0..count {
91            if chars.next().is_none() {
92                break;
93            }
94        }
95        self.buffer = chars.as_str();
96    }
97
98    #[inline]
99    fn peek(&self) -> char {
100        self.buffer.chars().next().unwrap_or('\0')
101    }
102
103    #[inline]
104    fn peek_nth(&self, n: usize) -> char {
105        let mut chars = self.buffer.chars();
106        for _ in 0..n {
107            if chars.next().is_none() {
108                return '\0';
109            }
110        }
111        chars.next().unwrap_or('\0')
112    }
113
114    #[inline]
115    fn look_ch(&mut self) -> char {
116        self.lookahead(1);
117        self.peek()
118    }
119
120    #[inline]
121    fn next_char_is(&self, c: char) -> bool {
122        self.peek() == c
123    }
124
125    #[inline]
126    fn nth_char_is(&self, n: usize, c: char) -> bool {
127        self.peek_nth(n) == c
128    }
129
130    #[inline]
131    fn next_2_are(&self, c1: char, c2: char) -> bool {
132        let mut chars = self.buffer.chars();
133        chars.next() == Some(c1) && chars.next() == Some(c2)
134    }
135
136    #[inline]
137    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
138        let mut chars = self.buffer.chars();
139        chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3)
140    }
141
142    #[inline]
143    fn next_is_document_indicator(&self) -> bool {
144        if self.buffer.len() < 3 {
145            false
146        } else {
147            // Since all characters we look for are ascii, we can directly use the byte API of str.
148            let bytes = self.buffer.as_bytes();
149            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
150                && (bytes[0] == b'.' || bytes[0] == b'-')
151                && bytes[0] == bytes[1]
152                && bytes[1] == bytes[2]
153        }
154    }
155
156    #[inline]
157    fn next_is_document_start(&self) -> bool {
158        if self.buffer.len() < 3 {
159            false
160        } else {
161            // Since all characters we look for are ascii, we can directly use the byte API of str.
162            let bytes = self.buffer.as_bytes();
163            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
164                && bytes[0] == b'-'
165                && bytes[1] == b'-'
166                && bytes[2] == b'-'
167        }
168    }
169
170    #[inline]
171    fn next_is_document_end(&self) -> bool {
172        if self.buffer.len() < 3 {
173            false
174        } else {
175            // Since all characters we look for are ascii, we can directly use the byte API of str.
176            let bytes = self.buffer.as_bytes();
177            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
178                && bytes[0] == b'.'
179                && bytes[1] == b'.'
180                && bytes[2] == b'.'
181        }
182    }
183
184    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
185        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
186
187        let mut new_str = self.buffer;
188        let mut has_yaml_ws = false;
189        let mut encountered_tab = false;
190
191        // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
192        // while keeping track of whether we encountered spaces and/or tabs.
193        if skip_tabs == SkipTabs::Yes {
194            loop {
195                if let Some(sub_str) = new_str.strip_prefix(' ') {
196                    has_yaml_ws = true;
197                    new_str = sub_str;
198                } else if let Some(sub_str) = new_str.strip_prefix('\t') {
199                    encountered_tab = true;
200                    new_str = sub_str;
201                } else {
202                    break;
203                }
204            }
205        } else {
206            while let Some(sub_str) = new_str.strip_prefix(' ') {
207                has_yaml_ws = true;
208                new_str = sub_str;
209            }
210        }
211
212        // All characters consumed were ascii. We can use the byte length difference to count the
213        // number of whitespace ignored.
214        let mut chars_consumed = self.buffer.len() - new_str.len();
215
216        if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
217            if !encountered_tab && !has_yaml_ws {
218                return (
219                    chars_consumed,
220                    Err("comments must be separated from other tokens by whitespace"),
221                );
222            }
223
224            // Skip remaining characters until we hit a breakz.
225            while let Some((c, sub_str)) = split_first_char(new_str) {
226                if is_breakz(c) {
227                    break;
228                }
229                new_str = sub_str;
230                chars_consumed += 1;
231            }
232        }
233
234        self.buffer = new_str;
235
236        (
237            chars_consumed,
238            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
239        )
240    }
241
242    #[allow(clippy::inline_always)]
243    #[inline(always)]
244    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
245        let c = self.buffer.as_bytes()[0];
246        if self.buffer.len() > 1 {
247            let nc = self.buffer.as_bytes()[1];
248            match c {
249                // indicators can end a plain scalar, see 7.3.3. Plain Style
250                b':' if is_blank_or_breakz(nc as char) || (in_flow && is_flow(nc as char)) => false,
251                c if in_flow && is_flow(c as char) => false,
252                _ => true,
253            }
254        } else {
255            match c {
256                // indicators can end a plain scalar, see 7.3.3. Plain Style
257                b':' => false,
258                c if in_flow && is_flow(c as char) => false,
259                _ => true,
260            }
261        }
262    }
263
264    #[inline]
265    fn next_is_blank_or_break(&self) -> bool {
266        !self.buffer.is_empty()
267            && (is_blank(self.buffer.as_bytes()[0] as char)
268                || is_break(self.buffer.as_bytes()[0] as char))
269    }
270
271    #[inline]
272    fn next_is_blank_or_breakz(&self) -> bool {
273        self.buffer.is_empty()
274            || (is_blank(self.buffer.as_bytes()[0] as char)
275                || is_breakz(self.buffer.as_bytes()[0] as char))
276    }
277
278    #[inline]
279    fn next_is_blank(&self) -> bool {
280        !self.buffer.is_empty() && is_blank(self.buffer.as_bytes()[0] as char)
281    }
282
283    #[inline]
284    fn next_is_break(&self) -> bool {
285        !self.buffer.is_empty() && is_break(self.buffer.as_bytes()[0] as char)
286    }
287
288    #[inline]
289    fn next_is_breakz(&self) -> bool {
290        self.buffer.is_empty() || is_breakz(self.buffer.as_bytes()[0] as char)
291    }
292
293    #[inline]
294    fn next_is_z(&self) -> bool {
295        self.buffer.is_empty() || is_z(self.buffer.as_bytes()[0] as char)
296    }
297
298    #[inline]
299    fn next_is_flow(&self) -> bool {
300        !self.buffer.is_empty() && is_flow(self.buffer.as_bytes()[0] as char)
301    }
302
303    #[inline]
304    fn next_is_digit(&self) -> bool {
305        !self.buffer.is_empty() && is_digit(self.buffer.as_bytes()[0] as char)
306    }
307
308    #[inline]
309    fn next_is_alpha(&self) -> bool {
310        !self.buffer.is_empty() && is_alpha(self.buffer.as_bytes()[0] as char)
311    }
312
313    fn skip_while_non_breakz(&mut self) -> usize {
314        let mut new_str = self.buffer;
315        let mut count = 0;
316
317        // Skip over all non-breaks.
318        while let Some((c, sub_str)) = split_first_char(new_str) {
319            if is_breakz(c) {
320                break;
321            }
322            new_str = sub_str;
323            count += 1;
324        }
325
326        self.buffer = new_str;
327
328        count
329    }
330
331    fn skip_while_blank(&mut self) -> usize {
332        // Since all characters we look for are ascii, we can directly use the byte API of str.
333        let mut i = 0;
334        while i < self.buffer.len() {
335            if !is_blank(self.buffer.as_bytes()[i] as char) {
336                break;
337            }
338            i += 1;
339        }
340        self.buffer = &self.buffer[i..];
341        i
342    }
343
344    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
345        let mut not_alpha = None;
346
347        // Skip while we have alpha characters.
348        let mut chars = self.buffer.chars();
349        for c in chars.by_ref() {
350            if !is_alpha(c) {
351                not_alpha = Some(c);
352                break;
353            }
354        }
355
356        let remaining_string = if let Some(c) = not_alpha {
357            let n_bytes_read = chars.as_str().as_ptr() as usize - self.buffer.as_ptr() as usize;
358            let last_char_bytes = c.len_utf8();
359            &self.buffer[n_bytes_read - last_char_bytes..]
360        } else {
361            chars.as_str()
362        };
363
364        let n_bytes_to_append = remaining_string.as_ptr() as usize - self.buffer.as_ptr() as usize;
365        out.reserve(n_bytes_to_append);
366        out.push_str(&self.buffer[..n_bytes_to_append]);
367        self.buffer = remaining_string;
368
369        n_bytes_to_append
370    }
371}
372
/// The buffer size advertised to the scanner through [`bufmaxlen`].
///
/// No buffer of this size is ever allocated here. In practice, the scanner can ask for any
/// character it wants: one that lies within the input buffer is returned as-is, anything else
/// reads as `\0`.
///
/// A finite value is needed nonetheless. How many characters we are asked to retrieve in
/// [`lookahead`] depends on the buffer size of the input, and the scanner may allocate buffers
/// of its own of the size we return in [`bufmaxlen`], which rules out [`usize::MAX`]. Returning
/// the number of characters left is not always an option either, since the scanner expects
/// [`buflen`] to return the same value that it has just given to [`lookahead`].
///
/// This creates a complex situation where [`bufmaxlen`] influences the value [`lookahead`] is
/// called with, which in turn dictates what [`buflen`] returns. In order to avoid breaking any
/// function, [`bufmaxlen`] returns this constant. Since the input is processed one line at a
/// time, it should be a good balance between memory consumption and what we expect the maximum
/// line length to be.
///
/// [`lookahead`]: `StrInput::lookahead`
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
/// [`buflen`]: `StrInput::buflen`
const BUFFER_LEN: usize = 128;
396
/// Pops the leading character off the given string.
///
/// Returns that character along with the remainder of the string, or `None` if the string is
/// empty.
#[inline]
fn split_first_char(s: &str) -> Option<(char, &str)> {
    let mut rest = s.chars();
    rest.next().map(|first| (first, rest.as_str()))
}
405
#[cfg(test)]
mod test {
    use crate::input::Input;

    use super::StrInput;

    /// `---` is a document start when followed by a line break, the end of input or a space.
    #[test]
    fn is_document_start() {
        let input = StrInput::new("---\n");
        assert!(input.next_is_document_start());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("---");
        assert!(input.next_is_document_start());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("...\n");
        assert!(!input.next_is_document_start());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("--- ");
        assert!(input.next_is_document_start());
        assert!(input.next_is_document_indicator());
    }

    /// `...` is a document end when followed by a line break, the end of input or a space.
    #[test]
    fn is_document_end() {
        let input = StrInput::new("...\n");
        assert!(input.next_is_document_end());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("...");
        assert!(input.next_is_document_end());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("---\n");
        assert!(!input.next_is_document_end());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("... ");
        assert!(input.next_is_document_end());
        assert!(input.next_is_document_indicator());
    }

    /// Inputs that are too short, mix the two marker characters or glue another character to
    /// the marker are not document indicators.
    #[test]
    fn is_not_document_indicator() {
        let rejected = [
            "", "-", "--", ".", "..", "-.-", "--.", "..-", "---a", "...a", "----", "....",
        ];
        for &text in &rejected {
            let input = StrInput::new(text);
            assert!(!input.next_is_document_start(), "start: {:?}", text);
            assert!(!input.next_is_document_end(), "end: {:?}", text);
            assert!(!input.next_is_document_indicator(), "indicator: {:?}", text);
        }
    }
}
saphyr_parser/input/str.rs

saphyr_parser/input/
str.rs