moore_common/
lexer.rs

// Copyright (c) 2016-2021 Fabian Schuiki

//! Lexer utilities.

use crate::errors::DiagResult;
use std::cmp::max;
use std::collections::VecDeque;
use std::io::Read;
/// A trait that can supply a peekable stream of characters.
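///
/// Implementors expose the input as a sliding window: `peek(n)` looks ahead
/// `n` characters without consuming anything, `consume(n)` advances the
/// current position, and `clear()` discards the consumed prefix. A minimal
/// usage sketch with the [`StringReader`] defined below (fenced as `ignore`
/// since the exact import path is not shown here):
///
/// ```ignore
/// let mut reader = StringReader::new("abc");
/// assert_eq!(reader.peek(0), Some('a')); // look ahead, nothing consumed
/// assert_eq!(reader.peek(1), Some('b'));
/// reader.consume(2);                     // advance past "ab"
/// assert_eq!(reader.to_string(), "ab");  // consumed since the last clear()
/// reader.clear();                        // forget the consumed prefix
/// assert_eq!(reader.peek(0), Some('c'));
/// ```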
pub trait Reader {
    /// Look at the character `offset` positions ahead of the current
    /// position, without consuming anything. Returns `None` past the end of
    /// the input.
    fn peek(&mut self, offset: usize) -> Option<char>;
    /// Advance the current position by `amount` characters.
    fn consume(&mut self, amount: usize);
    /// Discard all characters consumed since the last call to `clear()`.
    fn clear(&mut self);
    /// Return the characters consumed since the last call to `clear()` as a
    /// `String`.
    fn to_string(&self) -> String;
}

/// A trait that can supply a stream of tokens.
pub trait Lexer<T> {
    /// Produce the next token in the stream.
    fn next_token<'td>(&mut self) -> DiagResult<'td, T>;
}

/// A reader that accumulates all bytes it pulls from an underlying
/// `std::io::Read`, allowing arbitrary lookahead into the stream.
pub struct AccumulatingReader {
    rd: Box<dyn Read>,
    buf: Vec<u8>,
    /// Index of the first byte consumed since the last `clear()`.
    base: usize,
    /// Index of the current read position.
    pos: usize,
    /// Index one past the last valid byte in `buf`.
    tail: usize,
}

impl AccumulatingReader {
    pub fn new(rd: Box<dyn Read>) -> AccumulatingReader {
        AccumulatingReader {
            rd,
            buf: Vec::new(),
            base: 0,
            pos: 0,
            tail: 0,
        }
    }

    /// Grow and fill the internal buffer such that at least `min_len` bytes
    /// are present, or the end of the file has been reached. This function may
    /// shift the buffer contents around, in which case previous buffer indices
    /// become invalid. Recalculate all indices derived from `base`, `pos`, or
    /// `tail` after a call to this function.
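    ///
    /// For example, with `base = 2`, `pos = 5`, `tail = 8`, and `min_len = 9`,
    /// the bytes at indices `2..8` first move down to `0..6`, leaving
    /// `base = 0`, `pos = 3`, `tail = 6`, and an adjusted `min_len` of 7;
    /// reading then continues until `tail` reaches 7 or the input runs out.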
    pub fn refill(&mut self, mut min_len: usize) {
        // Move the buffer contents to the beginning to make more space at the
        // end.
        if self.base > 0 {
            self.buf.copy_within(self.base..self.tail, 0);
            self.pos -= self.base;
            self.tail -= self.base;
            min_len -= self.base;
            self.base = 0;
        }
        assert!(self.base == 0);

        // Increase the buffer size to make sure the requested min_len can be
        // accommodated.
        let cur_len = self.buf.len();
        if min_len > cur_len {
            let new_len = max(max(cur_len * 2, 32), min_len);
            self.buf.resize(new_len, 0);
        }
        assert!(self.buf.len() >= min_len);

        // Keep reading data until at least min_len bytes are in the buffer.
        // Note that this will usually fill the entire buffer.
        while min_len > self.tail {
            let nread = self.rd.read(&mut self.buf[self.tail..]).unwrap();
            if nread == 0 {
                break;
            }
            self.tail += nread;
        }
    }

    /// Return a slice of the consumed bytes.
    pub fn slice(&self) -> &[u8] {
        &self.buf[self.base..self.pos]
    }

    /// Return a slice of the bytes read so far, starting at the last call to
    /// `clear()`.
    pub fn rem_slice(&self) -> &[u8] {
        // Only the bytes up to `tail` are valid; anything beyond is stale
        // storage left over from `resize`.
        &self.buf[self.base..self.tail]
    }
}

impl Reader for AccumulatingReader {
    /// Return the value of the byte that is `off` bytes away from the current
    /// position in the input file. If `off` lies beyond the end of the file,
    /// `None` is returned.
    fn peek(&mut self, off: usize) -> Option<char> {
        // If the requested offset lies outside the chunk of the file that is
        // currently in memory, refill the buffer first.
        let idx = self.pos + off;
        if idx >= self.tail {
            self.refill(idx + 1)
        }

        // The previous call to refill may move things around in the buffer, so
        // the index needs to be recalculated. If it lies beyond what we were
        // able to read from the file, as indicated by `self.tail`, return
        // `None` to indicate the end of the file.
        let idx = self.pos + off;
        if idx < self.tail {
            // Note that this byte-to-char cast is only correct for ASCII
            // input; see the TODO in `to_string()` below.
            Some(self.buf[idx] as char)
        } else {
            None
        }
    }

    /// Consume the next `amt` bytes of the input. All consumed bytes since the
    /// last `clear()` can be accessed via `slice()` or `to_string()`.
    fn consume(&mut self, amt: usize) {
        self.pos += amt
    }

    /// Clear the consumed bytes.
    fn clear(&mut self) {
        self.base = self.pos
    }

    /// Convert the consumed bytes to a String.
    fn to_string(&self) -> String {
        // TODO: Don't just unwrap, but handle the case where the input file is
        // not valid UTF-8.
        String::from_utf8(self.slice().to_vec()).unwrap()
    }
}

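// A minimal compiled sketch of how `AccumulatingReader` is driven, using only
// `std::io::Cursor` as the input source. The module name and sample bytes are
// illustrative and not part of the original code.
#[cfg(test)]
mod accumulating_reader_sketch {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn peek_consume_clear() {
        let mut rd = AccumulatingReader::new(Box::new(Cursor::new(b"let x".to_vec())));
        // Peeking refills the buffer on demand and never consumes.
        assert_eq!(rd.peek(0), Some('l'));
        assert_eq!(rd.peek(4), Some('x'));
        assert_eq!(rd.peek(5), None); // past the end of the input
        // Consume the keyword, materialize it, then clear it away.
        rd.consume(3);
        assert_eq!(rd.to_string(), "let");
        rd.clear();
        assert_eq!(rd.peek(0), Some(' '));
    }
}
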
/// A reader that supplies characters from an in-memory string slice.
pub struct StringReader<'tdata> {
    iter: Box<dyn Iterator<Item = char> + 'tdata>,
    buf: Vec<char>,
    base: usize,
    pos: usize,
    tail: usize,
}

impl<'tdata> StringReader<'tdata> {
    pub fn new(data: &'tdata str) -> StringReader<'tdata> {
        StringReader {
            iter: Box::new(data.chars()),
            buf: Vec::new(),
            base: 0,
            pos: 0,
            tail: 0,
        }
    }

    /// Grow and fill the internal buffer such that at least `min_len`
    /// characters are present, or the end of the string has been reached. As
    /// with `AccumulatingReader::refill`, this may shift the buffer contents,
    /// invalidating previously derived indices.
    fn refill(&mut self, mut min_len: usize) {
        // Move the buffer contents to the beginning to make more space at the
        // end.
        if self.base > 0 {
            self.buf.copy_within(self.base..self.tail, 0);
            self.pos -= self.base;
            self.tail -= self.base;
            min_len -= self.base;
            self.base = 0;
        }
        assert!(self.base == 0);

        // Increase the buffer size to make sure the requested min_len can be
        // accommodated.
        let cur_len = self.buf.len();
        if min_len > cur_len {
            let new_len = max(max(cur_len * 2, 32), min_len);
            self.buf.resize(new_len, '\0');
        }
        assert!(self.buf.len() >= min_len);

        // Keep pulling characters until at least min_len of them are in the
        // buffer. Note that this will usually fill the entire buffer.
        while min_len > self.tail {
            match self.iter.next() {
                Some(c) => {
                    self.buf[self.tail] = c;
                    self.tail += 1;
                }
                None => break,
            }
        }
    }
}

impl<'tdata> Reader for StringReader<'tdata> {
    fn peek(&mut self, offset: usize) -> Option<char> {
        // If the requested offset lies outside the chunk of characters that
        // has been parsed, refill the buffer first.
        let idx = self.pos + offset;
        if idx >= self.tail {
            self.refill(idx + 1)
        }

        // The previous call to refill may move things around in the buffer, so
        // the index needs to be recalculated. If it lies beyond what we were
        // able to parse from the string, as indicated by `self.tail`, return
        // `None` to indicate the end of the string.
        let idx = self.pos + offset;
        if idx < self.tail {
            Some(self.buf[idx])
        } else {
            None
        }
    }

    fn consume(&mut self, amount: usize) {
        self.pos += amount;
    }

    fn clear(&mut self) {
        self.base = self.pos;
    }

    fn to_string(&self) -> String {
        self.buf[self.base..self.pos].iter().collect()
    }
}
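
// `StringReader` buffers `char`s rather than raw bytes, so multi-byte UTF-8
// scalars come through intact. A small compiled sketch (module name and
// sample string are illustrative):
#[cfg(test)]
mod string_reader_sketch {
    use super::*;

    #[test]
    fn peeks_whole_chars() {
        let mut rd = StringReader::new("héllo");
        assert_eq!(rd.peek(1), Some('é')); // one char, two UTF-8 bytes
        rd.consume(2);
        assert_eq!(rd.to_string(), "hé");
        assert_eq!(rd.peek(0), Some('l'));
    }
}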

/// A lexer that chains the tokens of multiple other lexers one after another.
/// Lexers can be pushed onto an internal stack and are then queried for tokens
/// until their respective Eof has been reached. At that point, the lexer is
/// popped off the stack and the next lexer's tokens are produced. An Eof is
/// returned once all lexers have been drained.
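///
/// A sketch of the intended use, assuming a hypothetical token type `Token`
/// with an `Eof` variant (neither is defined in this module):
///
/// ```ignore
/// let mut lexer = StackedLexer::new(Token::Eof);
/// lexer.push(Box::new(file_lexer));
/// lexer.push(Box::new(include_lexer)); // drained first
/// // Tokens come from `include_lexer` until its Eof, then from `file_lexer`;
/// // once both are exhausted, `Token::Eof` is returned indefinitely.
/// let token = lexer.next_token()?;
/// ```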
pub struct StackedLexer<T> {
    stack: Vec<Box<dyn Lexer<T>>>,
    eof: T,
}

impl<T> StackedLexer<T> {
    pub fn new(eof: T) -> StackedLexer<T> {
        StackedLexer {
            stack: Vec::new(),
            eof,
        }
    }

    pub fn push(&mut self, lexer: Box<dyn Lexer<T>>) {
        self.stack.push(lexer);
    }
}

impl<T: Clone + PartialEq> Lexer<T> for StackedLexer<T> {
    fn next_token<'td>(&mut self) -> DiagResult<'td, T> {
        loop {
            let token = if let Some(lexer) = self.stack.last_mut() {
                lexer.next_token()?
            } else {
                return Ok(self.eof.clone());
            };
            if token == self.eof {
                self.stack.pop();
                continue;
            } else {
                return Ok(token);
            }
        }
    }
}

/// A buffered lexer that allows tokens to be peeked at before they are
/// actually consumed.
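///
/// A sketch of the lookahead pattern this enables, assuming the same
/// hypothetical `Token` type as above:
///
/// ```ignore
/// let mut lexer = BufferedLexer::new(Box::new(inner_lexer), Token::Eof);
/// if *lexer.peek(0)? == Token::OpenParen && *lexer.peek(1)? == Token::Ident {
///     let lparen = lexer.pop()?; // consumes the token that peek(0) showed
/// }
/// ```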
pub struct BufferedLexer<T> {
    inner: Box<dyn Lexer<T>>,
    eof: T,
    buffer: VecDeque<T>,
    done: bool,
}

impl<T: Clone + PartialEq> BufferedLexer<T> {
    /// Create a new buffered lexer.
    pub fn new(inner: Box<dyn Lexer<T>>, eof: T) -> BufferedLexer<T> {
        BufferedLexer {
            inner,
            eof,
            buffer: VecDeque::new(),
            done: false,
        }
    }

    /// Peek at a token not yet consumed. This function merely returns a
    /// reference to said token. Use `pop()` to advance the lexer.
    pub fn peek<'td>(&mut self, offset: usize) -> DiagResult<'td, &T> {
        // Fill the buffer up to the offset that should be peeked at. If an Eof
        // is encountered beforehand, set the `done` flag and abort.
        while !self.done && self.buffer.len() <= offset {
            let token = self.inner.next_token()?;
            if token == self.eof {
                self.done = true;
            }
            self.buffer.push_back(token);
        }

        // Return the token at the requested offset, or the Eof token if the
        // offset lies beyond the end of the stream.
        Ok(self.buffer.get(offset).unwrap_or(&self.eof))
    }

    /// Insert a token in front of the stream such that it becomes the next
    /// token to be returned from `peek(0)` or `pop()`.
    pub fn push(&mut self, token: T) {
        self.buffer.push_front(token);
    }

    /// Consume and return the current token. This is the same token that would
    /// be returned by `peek(0)`.
    pub fn pop<'td>(&mut self) -> DiagResult<'td, T> {
        if self.buffer.is_empty() && !self.done {
            self.inner.next_token()
        } else {
            Ok(self
                .buffer
                .pop_front()
                .expect("pop() called on drained BufferedLexer"))
        }
    }

    /// Get a reference to the inner lexer.
    pub fn inner(&self) -> &dyn Lexer<T> {
        &*self.inner
    }

    /// Get a mutable reference to the inner lexer.
    pub fn inner_mut(&mut self) -> &mut dyn Lexer<T> {
        &mut *self.inner
    }
}