// moore_common/lexer.rs
// Copyright (c) 2016-2021 Fabian Schuiki

//! Lexer utilities.

use crate::errors::DiagResult;
use std::cmp::max;
use std::collections::VecDeque;
use std::io::Read;

/// A trait that can supply a peekable stream of characters.
pub trait Reader {
    /// Return the character `offset` positions ahead of the current position,
    /// or `None` if that position lies beyond the end of the input.
    fn peek(&mut self, offset: usize) -> Option<char>;
    /// Advance the current position by `amount` characters.
    fn consume(&mut self, amount: usize);
    /// Discard the characters consumed so far; subsequent calls to
    /// `to_string()` only cover characters consumed after this point.
    fn clear(&mut self);
    /// Return the characters consumed since the last call to `clear()` as a
    /// `String`.
    fn to_string(&self) -> String;
}

/// A trait that can supply a stream of tokens.
pub trait Lexer<T> {
    /// Produce the next token in the stream.
    // NOTE(review): callers such as `StackedLexer` and `BufferedLexer` compare
    // the result against an Eof sentinel token — implementations appear to be
    // expected to yield that token once the input is exhausted; confirm.
    fn next_token<'td>(&mut self) -> DiagResult<'td, T>;
}

/// A reader that pulls bytes from an underlying byte stream and accumulates
/// them in memory, so that consumed spans can be inspected via `slice()` or
/// converted to a string.
pub struct AccumulatingReader {
    /// The underlying byte stream.
    rd: Box<dyn Read>,
    /// Bytes read from `rd`. Only `buf[..tail]` holds valid data; the rest is
    /// zero padding from buffer growth.
    buf: Vec<u8>,
    /// Index of the first byte consumed since the last `clear()`.
    base: usize,
    /// Index of the current read position.
    pos: usize,
    /// Index one past the last valid byte in `buf`.
    tail: usize,
}

impl AccumulatingReader {
    /// Create a new reader that pulls bytes from `rd`.
    pub fn new(rd: Box<dyn Read>) -> AccumulatingReader {
        AccumulatingReader {
            rd,
            buf: Vec::new(),
            base: 0,
            pos: 0,
            tail: 0,
        }
    }

    /// Grow and fill the internal buffer such that at least min_len characters
    /// are present, or the end of the file has been reached. This function may
    /// shift the buffer contents around, in which case previous buffer indices
    /// become invalid. Recalculate all indices derived from `base`, `pos`, or
    /// `tail` after a call to this function.
    pub fn refill(&mut self, mut min_len: usize) {
        // Move the buffer contents to the beginning to make more space at the
        // end. All indices, including the requested length, shift down by
        // `base`. (Callers derive `min_len` from `pos`, which is >= `base`, so
        // the subtraction cannot underflow.)
        if self.base > 0 {
            self.buf.copy_within(self.base..self.tail, 0);
            self.pos -= self.base;
            self.tail -= self.base;
            min_len -= self.base;
            self.base = 0;
        }

        // Increase the buffer size to make sure the requested min_len can be
        // accommodated. Doubling alone is not enough: a large peek offset may
        // request more than twice the current length, so grow to at least
        // `min_len`.
        let cur_len = self.buf.len();
        if min_len > cur_len {
            let new_len = max(max(cur_len * 2, 32), min_len);
            self.buf.resize(new_len, 0);
        }
        assert!(self.buf.len() >= min_len);

        // Keep reading data until at least min_len bytes are in the buffer.
        // Note that this will usually fill in the entire buffer.
        while min_len > self.tail {
            // TODO: Handle read errors instead of unwrapping.
            let nread = self.rd.read(&mut self.buf[self.tail..]).unwrap();
            if nread == 0 {
                break;
            }
            self.tail += nread;
        }
    }

    /// Return a slice of the consumed bytes.
    pub fn slice(&self) -> &[u8] {
        &self.buf[self.base..self.pos]
    }

    /// Return a slice of the remaining bytes, starting at the last call to
    /// clear().
    // NOTE(review): this slices to the end of `buf`, which includes the zero
    // padding beyond `tail` — confirm whether callers expect `..self.tail`.
    pub fn rem_slice(&self) -> &[u8] {
        &self.buf[self.base..]
    }
}

97impl Reader for AccumulatingReader {
98 /// Return the value of the byte that is `off` bytes away from the current
99 /// position in the input file. If the `off` lies beyond the end of file,
100 /// `None` is returned.
101 fn peek(&mut self, off: usize) -> Option<char> {
102 // If the requested offset lies outside the chunk of the file that is
103 // currently in memory, refill the buffer first.
104 let idx = self.pos + off;
105 if idx >= self.tail {
106 self.refill(idx + 1)
107 }
108
109 // The previous call to refill may move things around in the buffer, so
110 // the index needs to be recalculated. If it lies beyond what we were
111 // able to read from the file, as indicated by `self.tail`, return
112 // `None` to indicate the end of the file.
113 let idx = self.pos + off;
114 if idx < self.tail {
115 Some(self.buf[idx] as char)
116 } else {
117 None
118 }
119 }
120
121 /// Consume the next `amt` bytes of the input. All consumed bytes since the
122 /// last `clear()` can be accessed via `slice()` or `to_string()`.
123 fn consume(&mut self, amt: usize) {
124 self.pos += amt
125 }
126
127 /// Clear the consumed bytes.
128 fn clear(&mut self) {
129 self.base = self.pos
130 }
131
132 /// Convert the consumed bytes to a String.
133 fn to_string(&self) -> String {
134 // TODO: Don't just unwrap, but handle the case where the input file is
135 // not valid UTF8.
136 String::from_utf8(self.slice().to_vec()).unwrap()
137 }
138}
139
/// A reader that pulls characters from a string slice, buffering them so that
/// arbitrary lookahead is possible.
pub struct StringReader<'tdata> {
    /// Iterator over the characters of the underlying string.
    iter: Box<dyn Iterator<Item = char> + 'tdata>,
    /// Characters pulled from `iter`. Only `buf[..tail]` holds valid data;
    /// the rest is `'\0'` padding from buffer growth.
    buf: Vec<char>,
    /// Index of the first character consumed since the last `clear()`.
    base: usize,
    /// Index of the current read position.
    pos: usize,
    /// Index one past the last valid character in `buf`.
    tail: usize,
}

impl<'tdata> StringReader<'tdata> {
    /// Create a new reader over the characters of `data`.
    pub fn new(data: &'tdata str) -> StringReader<'tdata> {
        StringReader {
            iter: Box::new(data.chars()),
            buf: Vec::new(),
            base: 0,
            pos: 0,
            tail: 0,
        }
    }

    /// Grow and fill the internal buffer such that at least `min_len`
    /// characters are present, or the end of the string has been reached. This
    /// function may shift the buffer contents around, in which case previous
    /// buffer indices become invalid. Recalculate all indices derived from
    /// `base`, `pos`, or `tail` after a call to this function.
    fn refill(&mut self, mut min_len: usize) {
        // Move the buffer contents to the beginning to make more space at the
        // end. All indices, including the requested length, shift down by
        // `base`.
        if self.base > 0 {
            self.buf.copy_within(self.base..self.tail, 0);
            self.pos -= self.base;
            self.tail -= self.base;
            min_len -= self.base;
            self.base = 0;
        }

        // Increase the buffer size to make sure the requested min_len can be
        // accommodated. Doubling alone is not enough: a large peek offset may
        // request more than twice the current length, so grow to at least
        // `min_len`.
        let cur_len = self.buf.len();
        if min_len > cur_len {
            let new_len = max(max(cur_len * 2, 32), min_len);
            self.buf.resize(new_len, '\0');
        }
        assert!(self.buf.len() >= min_len);

        // Keep pulling characters until at least min_len are in the buffer or
        // the string is exhausted.
        while min_len > self.tail {
            match self.iter.next() {
                Some(c) => {
                    self.buf[self.tail] = c;
                    self.tail += 1;
                }
                None => break,
            }
        }
    }
}

198impl<'tdata> Reader for StringReader<'tdata> {
199 fn peek(&mut self, offset: usize) -> Option<char> {
200 // If the requested offset lies outside the chunk of characters that has
201 // been parsed, refill the buffer first.
202 let idx = self.pos + offset;
203 if idx >= self.tail {
204 self.refill(idx + 1)
205 }
206
207 // The previous call to refill may move things around in the buffer, so
208 // the index needs to be recalculated. If it lies beyond what we were
209 // able to parse from the string, as indicated by `self.tail`, return
210 // `None` to indicate the end of the file.
211 let idx = self.pos + offset;
212 if idx < self.tail {
213 Some(self.buf[idx])
214 } else {
215 None
216 }
217 }
218
219 fn consume(&mut self, amount: usize) {
220 self.pos += amount;
221 }
222
223 fn clear(&mut self) {
224 self.base = self.pos;
225 }
226
227 fn to_string(&self) -> String {
228 let mut s = String::new();
229 for c in self.buf[self.base..self.pos].iter() {
230 s.push(*c);
231 }
232 s
233 }
234}
235
/// A lexer chaining the tokens of multiple other lexers after another. Lexers
/// can be pushed onto an internal stack and will then be queried for tokens
/// until their respective Eof has been reached. At that point, the lexer is
/// popped off the stack and the next lexer's tokens are produced. An Eof is
/// returned once all lexers have been drained.
pub struct StackedLexer<T> {
    /// The stack of lexers; tokens are pulled from the topmost (last) entry.
    stack: Vec<Box<dyn Lexer<T>>>,
    /// The token that marks the end of a lexer's stream.
    eof: T,
}

246impl<T> StackedLexer<T> {
247 pub fn new(eof: T) -> StackedLexer<T> {
248 StackedLexer {
249 stack: Vec::new(),
250 eof: eof,
251 }
252 }
253
254 pub fn push(&mut self, lexer: Box<dyn Lexer<T>>) {
255 self.stack.push(lexer);
256 }
257}
258
259impl<T: Clone + PartialEq> Lexer<T> for StackedLexer<T> {
260 fn next_token<'td>(&mut self) -> DiagResult<'td, T> {
261 loop {
262 let token = if let Some(lexer) = self.stack.last_mut() {
263 lexer.next_token()?
264 } else {
265 return Ok(self.eof.clone());
266 };
267 if token == self.eof {
268 self.stack.pop();
269 continue;
270 } else {
271 return Ok(token);
272 }
273 }
274 }
275}
276
/// A buffered lexer that allows tokens to be peeked at before they are actually
/// consumed.
pub struct BufferedLexer<T> {
    /// The lexer tokens are pulled from.
    inner: Box<dyn Lexer<T>>,
    /// The token that marks the end of the stream.
    eof: T,
    /// Tokens pulled from `inner` but not yet consumed.
    buffer: VecDeque<T>,
    /// Whether `inner` has produced its Eof token.
    done: bool,
}

286impl<T: Clone + PartialEq> BufferedLexer<T> {
287 /// Create a new buffered lexer.
288 pub fn new(inner: Box<dyn Lexer<T>>, eof: T) -> BufferedLexer<T> {
289 BufferedLexer {
290 inner: inner,
291 eof: eof,
292 buffer: VecDeque::new(),
293 done: false,
294 }
295 }
296
297 /// Peek at a token not yet consumed. This function merely returns a
298 /// reference to said token. Use `pop()` to advance the lexer.
299 pub fn peek<'td>(&mut self, offset: usize) -> DiagResult<'td, &T> {
300 // Fill the buffer up to the offset that should be peeked at. If an Eof is
301 // encountered beforehand, set the `done` flag and abort.
302 while !self.done && self.buffer.len() <= offset {
303 let token = self.inner.next_token()?;
304 if token == self.eof {
305 self.done = true;
306 }
307 self.buffer.push_back(token);
308 }
309
310 // Return the token at the requested offset, or the Eof token if the
311 // offset lies beyond the end of the stream.
312 Ok(self.buffer.get(offset).unwrap_or(&self.eof))
313 // if offset < self.buffer.len() {
314 // } else {
315 // Ok(self.eof.clone())
316 // }
317 }
318
319 /// Insert a token in front of the stream such that it becomes the next
320 /// token to be returned from `peek(0)` or `pop()`.
321 #[allow(unused_variables)]
322 pub fn push(&mut self, token: T) {
323 unimplemented!();
324 }
325
326 /// Consume and return the current token. This is the same token that would
327 /// be returned by `peek(0)`.
328 pub fn pop<'td>(&mut self) -> DiagResult<'td, T> {
329 if self.buffer.is_empty() && !self.done {
330 self.inner.next_token()
331 } else {
332 Ok(self
333 .buffer
334 .pop_front()
335 .expect("pop() called on drained BufferedLexer"))
336 }
337 }
338
339 pub fn inner(&self) -> &dyn Lexer<T> {
340 &*self.inner
341 }
342
343 pub fn inner_mut(&mut self) -> &mut dyn Lexer<T> {
344 &mut *self.inner
345 }
346}