Skip to main content

granit_parser/input/
buffered.rs

1use crate::char_traits::is_breakz;
2use crate::input::{BorrowedInput, Input};
3
4use arraydeque::ArrayDeque;
5
6/// The size of the [`BufferedInput`] buffer.
7///
8/// The buffer is statically allocated to avoid conditions for reallocations each time we
9/// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except:
10///   - Escape sequences parsing: some escape codes are 8 characters
11///   - Scanning indent in scalars: this looks ahead `indent + 2` characters
12///
13/// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done
14/// in a single call if and only if the indent is `BUFFER_LEN - 2` or less. If the indent is higher
15/// than that, the code will fall back to a loop of lookaheads.
16const BUFFER_LEN: usize = 16;
17
18/// A wrapper around an [`Iterator`] of [`char`]s with a buffer.
19///
20/// The YAML scanner often needs some lookahead. With fully allocated buffers such as `String` or
21/// `&str`, this is not an issue. However, with streams, we need to have a way of peeking multiple
22/// characters at a time and sometimes pushing some back into the stream.
23/// Doing this directly with iterator adapters would require pulling in all of `itertools` for one
24/// method, so this structure keeps the buffering local.
25#[allow(clippy::module_name_repetitions)]
26pub struct BufferedInput<T: Iterator<Item = char>> {
27    /// The iterator source.
28    input: T,
29    /// Buffer for the next characters to consume.
30    buffer: ArrayDeque<char, BUFFER_LEN>,
31    /// Number of front buffer characters that came from the iterator, not EOF padding.
32    real_buffered: usize,
33    /// Largest active lookahead window requested by the scanner.
34    lookahead: usize,
35    /// Whether the wrapped iterator has reported EOF.
36    source_exhausted: bool,
37}
38
39impl<T: Iterator<Item = char>> BufferedInput<T> {
40    /// Create a new [`BufferedInput`] over the given character iterator.
41    pub fn new(input: T) -> Self {
42        Self {
43            input,
44            buffer: ArrayDeque::default(),
45            real_buffered: 0,
46            lookahead: 0,
47            source_exhausted: false,
48        }
49    }
50
51    fn push_source_or_padding(&mut self) {
52        let c = if self.source_exhausted {
53            '\0'
54        } else if let Some(c) = self.input.next() {
55            self.real_buffered += 1;
56            c
57        } else {
58            self.source_exhausted = true;
59            '\0'
60        };
61        self.buffer.push_back(c).unwrap();
62    }
63
64    fn fill_lookahead(&mut self) {
65        while self.buffer.len() < self.lookahead {
66            self.push_source_or_padding();
67        }
68    }
69
70    fn pop_buffered(&mut self) -> Option<(char, bool)> {
71        let c = self.buffer.pop_front()?;
72        let is_real = self.real_buffered > 0;
73        if is_real {
74            self.real_buffered -= 1;
75        }
76        Some((c, is_real))
77    }
78
79    fn read_source_or_eof(&mut self) -> (char, bool) {
80        if self.source_exhausted {
81            ('\0', false)
82        } else if let Some(c) = self.input.next() {
83            (c, true)
84        } else {
85            self.source_exhausted = true;
86            ('\0', false)
87        }
88    }
89
90    fn raw_read_front(&mut self) -> (char, bool) {
91        let read = self
92            .pop_buffered()
93            .unwrap_or_else(|| self.read_source_or_eof());
94        self.fill_lookahead();
95        read
96    }
97
98    fn skip_one(&mut self) -> bool {
99        let skipped = match self.pop_buffered() {
100            Some((_, true)) => true,
101            Some((_, false)) => {
102                self.buffer.push_front('\0').unwrap();
103                false
104            }
105            None => self.read_source_or_eof().1,
106        };
107
108        if skipped {
109            self.fill_lookahead();
110        }
111        skipped
112    }
113}
114
115impl<T: Iterator<Item = char>> Input for BufferedInput<T> {
116    #[inline]
117    fn lookahead(&mut self, count: usize) {
118        self.lookahead = self.lookahead.max(count.min(BUFFER_LEN));
119        self.fill_lookahead();
120    }
121
122    #[inline]
123    fn buflen(&self) -> usize {
124        self.lookahead
125    }
126
127    #[inline]
128    fn bufmaxlen(&self) -> usize {
129        BUFFER_LEN
130    }
131
132    #[inline]
133    fn raw_read_ch(&mut self) -> char {
134        self.raw_read_front().0
135    }
136
137    #[inline]
138    fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
139        if let Some(c) = self.buffer.front().copied() {
140            if is_breakz(c) {
141                None
142            } else {
143                Some(self.raw_read_front().0)
144            }
145        } else {
146            let (c, is_real) = self.read_source_or_eof();
147            if !is_real {
148                None
149            } else if is_breakz(c) {
150                self.buffer.push_back(c).unwrap();
151                self.real_buffered += 1;
152                None
153            } else {
154                self.fill_lookahead();
155                Some(c)
156            }
157        }
158    }
159
160    #[inline]
161    fn skip(&mut self) {
162        self.skip_one();
163    }
164
165    #[inline]
166    fn skip_n(&mut self, count: usize) {
167        for _ in 0..count {
168            if !self.skip_one() {
169                break;
170            }
171        }
172    }
173
174    #[inline]
175    fn peek(&self) -> char {
176        self.buffer.front().copied().unwrap_or('\0')
177    }
178
179    #[inline]
180    fn peek_nth(&self, n: usize) -> char {
181        self.buffer.get(n).copied().unwrap_or('\0')
182    }
183}
184
185/// `BufferedInput` does not support zero-copy slicing since it's a streaming input
186/// without stable backing storage.
187impl<T: Iterator<Item = char>> BorrowedInput<'static> for BufferedInput<T> {
188    #[inline]
189    fn slice_borrowed(&self, _start: usize, _end: usize) -> Option<&'static str> {
190        None
191    }
192}
193
194#[cfg(test)]
195mod tests {
196    use crate::input::str::StrInput;
197
198    use super::*;
199
200    #[test]
201    fn lookahead_larger_than_buffer_is_clamped() {
202        let mut input = BufferedInput::new("abc".chars());
203
204        input.lookahead(BUFFER_LEN + 8);
205
206        assert_eq!(input.buflen(), BUFFER_LEN);
207        assert_eq!(input.peek(), 'a');
208        assert_eq!(input.peek_nth(1), 'b');
209        assert_eq!(input.peek_nth(2), 'c');
210        assert_eq!(input.peek_nth(3), '\0');
211    }
212
213    #[test]
214    fn raw_reads_use_stream_front_and_report_eof() {
215        let mut input = BufferedInput::new("a".chars());
216
217        assert_eq!(input.raw_read_ch(), 'a');
218        assert_eq!(input.raw_read_ch(), '\0');
219
220        let mut input = BufferedInput::new("ab".chars());
221        input.lookahead(1);
222        assert_eq!(input.raw_read_ch(), 'a');
223        assert_eq!(input.peek(), 'b');
224    }
225
226    #[test]
227    fn raw_read_non_breakz_leaves_break_at_stream_front() {
228        let mut input = BufferedInput::new("a\n".chars());
229
230        assert_eq!(input.raw_read_non_breakz_ch(), Some('a'));
231        assert_eq!(input.raw_read_non_breakz_ch(), None);
232        assert_eq!(input.peek(), '\n');
233        input.lookahead(1);
234        assert_eq!(input.buflen(), 1);
235        assert_eq!(input.peek(), '\n');
236
237        let mut empty = BufferedInput::new("".chars());
238        assert_eq!(empty.raw_read_non_breakz_ch(), None);
239    }
240
241    #[test]
242    fn skip_n_consumes_stream_front_and_preserves_lookahead_window() {
243        let mut input = BufferedInput::new("abcdef".chars());
244
245        input.lookahead(5);
246        input.skip_n(2);
247
248        assert_eq!(input.buflen(), 5);
249        assert_eq!(input.peek(), 'c');
250        assert_eq!(input.peek_nth(3), 'f');
251        assert_eq!(input.peek_nth(4), '\0');
252    }
253
254    #[test]
255    fn skip_without_lookahead_consumes_like_str_input() {
256        let mut buffered = BufferedInput::new("ab".chars());
257        buffered.skip();
258        buffered.lookahead(1);
259
260        let mut str_input = StrInput::new("ab");
261        str_input.skip();
262        str_input.lookahead(1);
263
264        assert_eq!(buffered.peek(), str_input.peek());
265    }
266
267    #[test]
268    fn skip_n_saturates_at_eof_like_str_input() {
269        let mut buffered = BufferedInput::new("abc".chars());
270        buffered.lookahead(1);
271        buffered.skip_n(8);
272        buffered.lookahead(1);
273
274        let mut str_input = StrInput::new("abc");
275        str_input.lookahead(1);
276        str_input.skip_n(8);
277        str_input.lookahead(1);
278
279        assert_eq!(buffered.peek(), str_input.peek());
280    }
281
282    #[test]
283    fn buflen_matches_str_input_lookahead_window_after_consumption() {
284        let mut buffered = BufferedInput::new("ab".chars());
285        buffered.lookahead(2);
286        buffered.skip();
287        buffered.skip();
288
289        let mut str_input = StrInput::new("ab");
290        str_input.lookahead(2);
291        str_input.skip();
292        str_input.skip();
293
294        assert_eq!(buffered.buflen(), str_input.buflen());
295        assert_eq!(buffered.buf_is_empty(), str_input.buf_is_empty());
296        assert_eq!(buffered.peek(), str_input.peek());
297    }
298
299    #[test]
300    fn streaming_input_never_borrows_slices() {
301        let input = BufferedInput::new("abc".chars());
302
303        assert_eq!(BorrowedInput::slice_borrowed(&input, 0, 1), None);
304    }
305}