// Copyright (c) 2016-2020 Fabian Schuiki
//! Lexer utilities.
use crate::errors::DiagResult;
use std::cmp::max;
use std::collections::VecDeque;
use std::io::Read;
/// A trait that can supply a peekable stream of characters.
pub trait Reader {
fn peek(&mut self, offset: usize) -> Option<char>;
fn consume(&mut self, amount: usize);
fn clear(&mut self);
fn to_string(&self) -> String;
}
/// A trait that can supply a stream of tokens.
pub trait Lexer<T> {
fn next_token<'td>(&mut self) -> DiagResult<'td, T>;
}
pub struct AccumulatingReader {
rd: Box<dyn Read>,
buf: Vec<u8>,
base: usize,
pos: usize,
tail: usize,
}
impl AccumulatingReader {
pub fn new(rd: Box<dyn Read>) -> AccumulatingReader {
AccumulatingReader {
rd: rd,
buf: Vec::new(),
base: 0,
pos: 0,
tail: 0,
}
}
/// Grow and fill the internal buffer such that at least min_len characters
/// are present, or the end of the file has been reached. This function may
/// shift the buffer contents around, in which case previous buffer indices
/// become invalid. Recalculate all indices derived from `base`, `pos`, or
/// `tail` after a call to this function.
pub fn refill(&mut self, mut min_len: usize) {
// Move the buffer contents to the beginning to make more space at the
// end.
if self.base > 0 {
for i in 0..(self.tail - self.base) {
self.buf[i] = self.buf[self.base + i]
}
self.pos -= self.base;
self.tail -= self.base;
min_len -= self.base;
self.base = 0;
}
assert!(self.base == 0);
// Increase the buffer size to make sure the requested min_len can be
// accomodated.
let cur_len = self.buf.len();
if min_len > cur_len {
let new_len = max(cur_len * 2, 32);
self.buf.resize(new_len, 0);
}
assert!(self.buf.len() >= min_len);
// Keep reading data until at least min_len bytes are in the buffer.
// Note that this will usually fill in the entire buffer.
while min_len > self.tail {
let nread = {
let dst = &mut self.buf[self.tail..];
let nread = self.rd.read(dst).unwrap();
nread
};
if nread == 0 {
break;
}
self.tail += nread;
}
}
/// Return a slice of the consumed bytes.
pub fn slice(&self) -> &[u8] {
&self.buf[self.base..self.pos]
}
/// Return a slice of the remaining bytes, starting at the last call to
/// clear().
pub fn rem_slice(&self) -> &[u8] {
&self.buf[self.base..]
}
}
impl Reader for AccumulatingReader {
/// Return the value of the byte that is `off` bytes away from the current
/// position in the input file. If the `off` lies beyond the end of file,
/// `None` is returned.
fn peek(&mut self, off: usize) -> Option<char> {
// If the requested offset lies outside the chunk of the file that is
// currently in memory, refill the buffer first.
let idx = self.pos + off;
if idx >= self.tail {
self.refill(idx + 1)
}
// The previous call to refill may move things around in the buffer, so
// the index needs to be recalculated. If it lies beyond what we were
// able to read from the file, as indicated by `self.tail`, return
// `None` to indicate the end of the file.
let idx = self.pos + off;
if idx < self.tail {
Some(self.buf[idx] as char)
} else {
None
}
}
/// Consume the next `amt` bytes of the input. All consumed bytes since the
/// last `clear()` can be accessed via `slice()` or `to_string()`.
fn consume(&mut self, amt: usize) {
self.pos += amt
}
/// Clear the consumed bytes.
fn clear(&mut self) {
self.base = self.pos
}
/// Convert the consumed bytes to a String.
fn to_string(&self) -> String {
// TODO: Don't just unwrap, but handle the case where the input file is
// not valid UTF8.
String::from_utf8(self.slice().to_vec()).unwrap()
}
}
pub struct StringReader<'tdata> {
// data: &'tdata str,
iter: Box<dyn Iterator<Item = char> + 'tdata>,
buf: Vec<char>,
base: usize,
pos: usize,
tail: usize,
}
impl<'tdata> StringReader<'tdata> {
pub fn new(data: &'tdata str) -> StringReader<'tdata> {
StringReader {
// data: data,
iter: Box::new(data.chars()),
buf: Vec::new(),
base: 0,
pos: 0,
tail: 0,
}
}
fn refill(&mut self, mut min_len: usize) {
// Move the buffer contents to the beginning to make more space at the
// end.
if self.base > 0 {
for i in 0..(self.tail - self.base) {
self.buf[i] = self.buf[self.base + i]
}
self.pos -= self.base;
self.tail -= self.base;
min_len -= self.base;
self.base = 0;
}
assert!(self.base == 0);
// Increase the buffer size to make sure the requested min_len can be
// accomodated.
let cur_len = self.buf.len();
if min_len > cur_len {
let new_len = max(cur_len * 2, 32);
self.buf.resize(new_len, 0 as char);
}
assert!(self.buf.len() >= min_len);
// Keep reading data until at least min_len bytes are in the buffer.
// Note that this will usually fill in the entire buffer.
while min_len > self.tail {
match self.iter.next() {
Some(c) => {
self.buf[self.tail] = c;
self.tail += 1;
}
None => break,
}
}
}
}
impl<'tdata> Reader for StringReader<'tdata> {
fn peek(&mut self, offset: usize) -> Option<char> {
// If the requested offset lies outside the chunk of characters that has
// been parsed, refill the buffer first.
let idx = self.pos + offset;
if idx >= self.tail {
self.refill(idx + 1)
}
// The previous call to refill may move things around in the buffer, so
// the index needs to be recalculated. If it lies beyond what we were
// able to parse from the string, as indicated by `self.tail`, return
// `None` to indicate the end of the file.
let idx = self.pos + offset;
if idx < self.tail {
Some(self.buf[idx])
} else {
None
}
}
fn consume(&mut self, amount: usize) {
self.pos += amount;
}
fn clear(&mut self) {
self.base = self.pos;
}
fn to_string(&self) -> String {
let mut s = String::new();
for c in self.buf[self.base..self.pos].iter() {
s.push(*c);
}
s
}
}
/// A lexer chaining the tokens of multiple other lexers after another. Lexers
/// can be pushed onto an internal stack and will then be queried for tokens
/// until their respective Eof has been reached. At that point, the lexer is
/// popped off the stack and the next lexer's tokens are produced. An Eof is
/// returned once all lexers have been drained.
pub struct StackedLexer<T> {
stack: Vec<Box<dyn Lexer<T>>>,
eof: T,
}
impl<T> StackedLexer<T> {
pub fn new(eof: T) -> StackedLexer<T> {
StackedLexer {
stack: Vec::new(),
eof: eof,
}
}
pub fn push(&mut self, lexer: Box<dyn Lexer<T>>) {
self.stack.push(lexer);
}
}
impl<T: Clone + PartialEq> Lexer<T> for StackedLexer<T> {
fn next_token<'td>(&mut self) -> DiagResult<'td, T> {
loop {
let token = if let Some(lexer) = self.stack.last_mut() {
lexer.next_token()?
} else {
return Ok(self.eof.clone());
};
if token == self.eof {
self.stack.pop();
continue;
} else {
return Ok(token);
}
}
}
}
/// A buffered lexer that allows tokens to be peeked at before they are actually
/// consumed.
pub struct BufferedLexer<T> {
inner: Box<dyn Lexer<T>>,
eof: T,
buffer: VecDeque<T>,
done: bool,
}
impl<T: Clone + PartialEq> BufferedLexer<T> {
/// Create a new buffered lexer.
pub fn new(inner: Box<dyn Lexer<T>>, eof: T) -> BufferedLexer<T> {
BufferedLexer {
inner: inner,
eof: eof,
buffer: VecDeque::new(),
done: false,
}
}
/// Peek at a token not yet consumed. This function merely returns a
/// reference to said token. Use `pop()` to advance the lexer.
pub fn peek<'td>(&mut self, offset: usize) -> DiagResult<'td, &T> {
// Fill the buffer up to the offset that should be peeked at. If an Eof is
// encountered beforehand, set the `done` flag and abort.
while !self.done && self.buffer.len() <= offset {
let token = self.inner.next_token()?;
if token == self.eof {
self.done = true;
}
self.buffer.push_back(token);
}
// Return the token at the requested offset, or the Eof token if the
// offset lies beyond the end of the stream.
Ok(self.buffer.get(offset).unwrap_or(&self.eof))
// if offset < self.buffer.len() {
// } else {
// Ok(self.eof.clone())
// }
}
/// Insert a token in front of the stream such that it becomes the next
/// token to be returned from `peek(0)` or `pop()`.
#[allow(unused_variables)]
pub fn push(&mut self, token: T) {
unimplemented!();
}
/// Consume and return the current token. This is the same token that would
/// be returned by `peek(0)`.
pub fn pop<'td>(&mut self) -> DiagResult<'td, T> {
if self.buffer.is_empty() && !self.done {
self.inner.next_token()
} else {
Ok(self
.buffer
.pop_front()
.expect("pop() called on drained BufferedLexer"))
}
}
pub fn inner(&self) -> &dyn Lexer<T> {
&*self.inner
}
pub fn inner_mut(&mut self) -> &mut dyn Lexer<T> {
&mut *self.inner
}
}