apollo_saphyr_parser/input/
str.rs

1use crate::{
2    char_traits::{
3        is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z,
4    },
5    input::{Input, SkipTabs},
6};
7use alloc::string::String;
8
/// An [`Input`] implementation reading directly from a borrowed `&str`.
#[allow(clippy::module_name_repetitions)]
pub struct StrInput<'a> {
    /// What remains of the source string; characters are dropped from the front as they are
    /// consumed.
    buffer: &'a str,
    /// The number of characters the parser asked us to look ahead.
    ///
    /// No character is actually buffered, but the requested amount has to be remembered so that
    /// [`Self::buflen`] can report it back.
    lookahead: usize,
}

impl<'a> StrInput<'a> {
    /// Wrap `input` in a new [`StrInput`] positioned at its first character.
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        StrInput {
            lookahead: 0,
            buffer: input,
        }
    }
}
31
32impl Input for StrInput<'_> {
33    #[inline]
34    fn lookahead(&mut self, x: usize) {
35        // We already have all characters that we need.
36        // We cannot add '\0's to the buffer should we prematurely reach EOF.
37        // Returning '\0's befalls the character-retrieving functions.
38        self.lookahead = self.lookahead.max(x);
39    }
40
41    #[inline]
42    fn buflen(&self) -> usize {
43        self.lookahead
44    }
45
46    #[inline]
47    fn bufmaxlen(&self) -> usize {
48        BUFFER_LEN
49    }
50
51    fn buf_is_empty(&self) -> bool {
52        self.buflen() == 0
53    }
54
55    #[inline]
56    fn raw_read_ch(&mut self) -> char {
57        let mut chars = self.buffer.chars();
58        if let Some(c) = chars.next() {
59            self.buffer = chars.as_str();
60            c
61        } else {
62            '\0'
63        }
64    }
65
66    #[inline]
67    fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
68        if let Some((c, sub_str)) = split_first_char(self.buffer) {
69            if is_breakz(c) {
70                None
71            } else {
72                self.buffer = sub_str;
73                Some(c)
74            }
75        } else {
76            None
77        }
78    }
79
80    #[inline]
81    fn skip(&mut self) {
82        let mut chars = self.buffer.chars();
83        if chars.next().is_some() {
84            self.buffer = chars.as_str();
85        }
86    }
87
88    #[inline]
89    fn skip_n(&mut self, count: usize) {
90        let mut chars = self.buffer.chars();
91        for _ in 0..count {
92            if chars.next().is_none() {
93                break;
94            }
95        }
96        self.buffer = chars.as_str();
97    }
98
99    #[inline]
100    fn peek(&self) -> char {
101        self.buffer.chars().next().unwrap_or('\0')
102    }
103
104    #[inline]
105    fn peek_nth(&self, n: usize) -> char {
106        let mut chars = self.buffer.chars();
107        for _ in 0..n {
108            if chars.next().is_none() {
109                return '\0';
110            }
111        }
112        chars.next().unwrap_or('\0')
113    }
114
115    #[inline]
116    fn look_ch(&mut self) -> char {
117        self.lookahead(1);
118        self.peek()
119    }
120
121    #[inline]
122    fn next_char_is(&self, c: char) -> bool {
123        self.peek() == c
124    }
125
126    #[inline]
127    fn nth_char_is(&self, n: usize, c: char) -> bool {
128        self.peek_nth(n) == c
129    }
130
131    #[inline]
132    fn next_2_are(&self, c1: char, c2: char) -> bool {
133        let mut chars = self.buffer.chars();
134        chars.next() == Some(c1) && chars.next() == Some(c2)
135    }
136
137    #[inline]
138    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
139        let mut chars = self.buffer.chars();
140        chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3)
141    }
142
143    #[inline]
144    fn next_is_document_indicator(&self) -> bool {
145        if self.buffer.len() < 3 {
146            false
147        } else {
148            // Since all characters we look for are ascii, we can directly use the byte API of str.
149            let bytes = self.buffer.as_bytes();
150            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
151                && (bytes[0] == b'.' || bytes[0] == b'-')
152                && bytes[0] == bytes[1]
153                && bytes[1] == bytes[2]
154        }
155    }
156
157    #[inline]
158    fn next_is_document_start(&self) -> bool {
159        if self.buffer.len() < 3 {
160            false
161        } else {
162            // Since all characters we look for are ascii, we can directly use the byte API of str.
163            let bytes = self.buffer.as_bytes();
164            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
165                && bytes[0] == b'-'
166                && bytes[1] == b'-'
167                && bytes[2] == b'-'
168        }
169    }
170
171    #[inline]
172    fn next_is_document_end(&self) -> bool {
173        if self.buffer.len() < 3 {
174            false
175        } else {
176            // Since all characters we look for are ascii, we can directly use the byte API of str.
177            let bytes = self.buffer.as_bytes();
178            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
179                && bytes[0] == b'.'
180                && bytes[1] == b'.'
181                && bytes[2] == b'.'
182        }
183    }
184
185    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
186        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
187
188        let mut new_str = self.buffer;
189        let mut has_yaml_ws = false;
190        let mut encountered_tab = false;
191
192        // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
193        // while keeping track of whether we encountered spaces and/or tabs.
194        if skip_tabs == SkipTabs::Yes {
195            loop {
196                if let Some(sub_str) = new_str.strip_prefix(' ') {
197                    has_yaml_ws = true;
198                    new_str = sub_str;
199                } else if let Some(sub_str) = new_str.strip_prefix('\t') {
200                    encountered_tab = true;
201                    new_str = sub_str;
202                } else {
203                    break;
204                }
205            }
206        } else {
207            while let Some(sub_str) = new_str.strip_prefix(' ') {
208                has_yaml_ws = true;
209                new_str = sub_str;
210            }
211        }
212
213        // All characters consumed were ascii. We can use the byte length difference to count the
214        // number of whitespace ignored.
215        let mut chars_consumed = self.buffer.len() - new_str.len();
216
217        if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
218            if !encountered_tab && !has_yaml_ws {
219                return (
220                    chars_consumed,
221                    Err("comments must be separated from other tokens by whitespace"),
222                );
223            }
224
225            // Skip remaining characters until we hit a breakz.
226            while let Some((c, sub_str)) = split_first_char(new_str) {
227                if is_breakz(c) {
228                    break;
229                }
230                new_str = sub_str;
231                chars_consumed += 1;
232            }
233        }
234
235        self.buffer = new_str;
236
237        (
238            chars_consumed,
239            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
240        )
241    }
242
243    #[allow(clippy::inline_always)]
244    #[inline(always)]
245    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
246        let c = self.buffer.as_bytes()[0];
247        if self.buffer.len() > 1 {
248            let nc = self.buffer.as_bytes()[1];
249            match c {
250                // indicators can end a plain scalar, see 7.3.3. Plain Style
251                b':' if is_blank_or_breakz(nc as char) || (in_flow && is_flow(nc as char)) => false,
252                c if in_flow && is_flow(c as char) => false,
253                _ => true,
254            }
255        } else {
256            match c {
257                // indicators can end a plain scalar, see 7.3.3. Plain Style
258                b':' => false,
259                c if in_flow && is_flow(c as char) => false,
260                _ => true,
261            }
262        }
263    }
264
265    #[inline]
266    fn next_is_blank_or_break(&self) -> bool {
267        !self.buffer.is_empty()
268            && (is_blank(self.buffer.as_bytes()[0] as char)
269                || is_break(self.buffer.as_bytes()[0] as char))
270    }
271
272    #[inline]
273    fn next_is_blank_or_breakz(&self) -> bool {
274        self.buffer.is_empty()
275            || (is_blank(self.buffer.as_bytes()[0] as char)
276                || is_breakz(self.buffer.as_bytes()[0] as char))
277    }
278
279    #[inline]
280    fn next_is_blank(&self) -> bool {
281        !self.buffer.is_empty() && is_blank(self.buffer.as_bytes()[0] as char)
282    }
283
284    #[inline]
285    fn next_is_break(&self) -> bool {
286        !self.buffer.is_empty() && is_break(self.buffer.as_bytes()[0] as char)
287    }
288
289    #[inline]
290    fn next_is_breakz(&self) -> bool {
291        self.buffer.is_empty() || is_breakz(self.buffer.as_bytes()[0] as char)
292    }
293
294    #[inline]
295    fn next_is_z(&self) -> bool {
296        self.buffer.is_empty() || is_z(self.buffer.as_bytes()[0] as char)
297    }
298
299    #[inline]
300    fn next_is_flow(&self) -> bool {
301        !self.buffer.is_empty() && is_flow(self.buffer.as_bytes()[0] as char)
302    }
303
304    #[inline]
305    fn next_is_digit(&self) -> bool {
306        !self.buffer.is_empty() && is_digit(self.buffer.as_bytes()[0] as char)
307    }
308
309    #[inline]
310    fn next_is_alpha(&self) -> bool {
311        !self.buffer.is_empty() && is_alpha(self.buffer.as_bytes()[0] as char)
312    }
313
314    fn skip_while_non_breakz(&mut self) -> usize {
315        let mut new_str = self.buffer;
316        let mut count = 0;
317
318        // Skip over all non-breaks.
319        while let Some((c, sub_str)) = split_first_char(new_str) {
320            if is_breakz(c) {
321                break;
322            }
323            new_str = sub_str;
324            count += 1;
325        }
326
327        self.buffer = new_str;
328
329        count
330    }
331
332    fn skip_while_blank(&mut self) -> usize {
333        // Since all characters we look for are ascii, we can directly use the byte API of str.
334        let mut i = 0;
335        while i < self.buffer.len() {
336            if !is_blank(self.buffer.as_bytes()[i] as char) {
337                break;
338            }
339            i += 1;
340        }
341        self.buffer = &self.buffer[i..];
342        i
343    }
344
345    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
346        let mut not_alpha = None;
347
348        // Skip while we have alpha characters.
349        let mut chars = self.buffer.chars();
350        for c in chars.by_ref() {
351            if !is_alpha(c) {
352                not_alpha = Some(c);
353                break;
354            }
355        }
356
357        let remaining_string = if let Some(c) = not_alpha {
358            let n_bytes_read = chars.as_str().as_ptr() as usize - self.buffer.as_ptr() as usize;
359            let last_char_bytes = c.len_utf8();
360            &self.buffer[n_bytes_read - last_char_bytes..]
361        } else {
362            chars.as_str()
363        };
364
365        let n_bytes_to_append = remaining_string.as_ptr() as usize - self.buffer.as_ptr() as usize;
366        out.reserve(n_bytes_to_append);
367        out.push_str(&self.buffer[..n_bytes_to_append]);
368        self.buffer = remaining_string;
369
370        n_bytes_to_append
371    }
372}
373
/// The buffer size we report to the scanner through [`bufmaxlen`].
///
/// No buffer of this size is ever allocated. The scanner may ask for any character it wants: if
/// that character lies within the input it is returned, otherwise `\0` is returned.
///
/// How many characters we are asked for in [`lookahead`] depends on the buffer size the input
/// advertises. Our buffer is virtually unlimited, but the scanner cannot work with that: it may
/// allocate buffers of its own, sized after [`bufmaxlen`], which rules out [`usize::MAX`].
/// Returning the number of characters left does not work either, because the scanner expects
/// [`buflen`] to return the value it has just passed to [`lookahead`].
///
/// This creates a circular dependency: [`bufmaxlen`] influences the value [`lookahead`] is
/// called with, which in turn dictates what [`buflen`] returns. To keep all three consistent, we
/// return this constant from [`bufmaxlen`]. Since the input is processed one line at a time, it
/// is meant as a balance between memory consumption and the longest line we expect to see.
///
/// [`lookahead`]: `StrInput::lookahead`
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
/// [`buflen`]: `StrInput::buflen`
const BUFFER_LEN: usize = 128;
397
/// Detaches the first character of `s`.
///
/// Returns that character together with everything that follows it, or `None` when `s` is
/// empty.
#[inline]
fn split_first_char(s: &str) -> Option<(char, &str)> {
    s.chars()
        .next()
        .map(|first| (first, &s[first.len_utf8()..]))
}
406
#[cfg(test)]
mod test {
    use crate::input::Input;

    use super::StrInput;

    /// `---` is a document start when followed by a break, a blank or the end of the input;
    /// `...` never is. All of these are document indicators.
    #[test]
    pub fn is_document_start() {
        let cases = [
            ("---\n", true),
            ("---", true),
            ("...\n", false),
            ("--- ", true),
        ];
        for &(source, expect_start) in &cases {
            let input = StrInput::new(source);
            assert_eq!(input.next_is_document_start(), expect_start);
            assert!(input.next_is_document_indicator());
        }
    }

    /// `...` is a document end when followed by a break, a blank or the end of the input;
    /// `---` never is. All of these are document indicators.
    #[test]
    pub fn is_document_end() {
        let cases = [
            ("...\n", true),
            ("...", true),
            ("---\n", false),
            ("... ", true),
        ];
        for &(source, expect_end) in &cases {
            let input = StrInput::new(source);
            assert_eq!(input.next_is_document_end(), expect_end);
            assert!(input.next_is_document_indicator());
        }
    }
}
apollo_saphyr_parser/input/str.rs

apollo_saphyr_parser/input/
str.rs