use anyhow::{Context, Result};
use super::env::SourceBlock;
use crate::error::UnexpectedEof;
/// Lexer operating on a stack of source blocks.
///
/// Every scanning method works on the block at the top of the stack (the last
/// element of `blocks`); a default-constructed lexer has an empty stack.
#[derive(Default)]
pub struct Lexer {
/// Stack of per-block scanning states; the last element is the active one.
blocks: Vec<SourceBlockState>,
}
impl Lexer {
    /// Wraps `block` in a fresh scanning state and makes it the active
    /// (topmost) source block.
    pub fn push_source_block(&mut self, block: SourceBlock) {
        let state = SourceBlockState::from(block);
        self.blocks.push(state);
    }

    /// Removes the topmost source block.
    ///
    /// Returns `true` if a block was removed and `false` if the stack was
    /// already empty.
    pub fn pop_source_block(&mut self) -> bool {
        let removed = self.blocks.pop();
        removed.is_some()
    }

    /// Drops every source block except the bottom one (if any).
    pub fn reset_until_base(&mut self) {
        self.blocks.truncate(1);
    }

    /// Describes the position of the most recently scanned word in the active
    /// block, or returns `None` when there are no source blocks.
    ///
    /// The `offset` field of the result is the current number of blocks on
    /// the stack.
    pub fn get_position(&self) -> Option<LexerPosition<'_>> {
        self.blocks.last().map(|top| LexerPosition {
            offset: self.blocks.len(),
            source_block_name: top.block.name(),
            line: &top.line,
            word_start: top.prev_word_start,
            word_end: top.prev_word_end,
            line_number: top.line_number,
        })
    }

    /// Nesting depth of the active block: `0` for the bottom block, `-1` when
    /// the stack is empty.
    pub fn depth(&self) -> i32 {
        let count = self.blocks.len() as i32;
        count - 1
    }

    /// Scans the next whitespace-delimited word from the active block.
    ///
    /// Returns `Ok(None)` when there is no active block or the block yields
    /// no further word.
    pub fn scan_word(&mut self) -> Result<Option<&str>> {
        match self.blocks.last_mut() {
            Some(top) => top.scan_word(),
            None => Ok(None),
        }
    }

    /// Same as [`Lexer::scan_word`], but maps "no word" (no active block or
    /// end of input) to an empty string.
    pub fn scan_until_space_or_eof(&mut self) -> Result<&str> {
        let word = match self.blocks.last_mut() {
            Some(top) => top.scan_word()?,
            None => None,
        };
        Ok(word.unwrap_or(""))
    }

    /// Scans the active block up to `delimiter` and returns the text before
    /// it.
    ///
    /// # Errors
    ///
    /// Fails with `UnexpectedEof` when there is no active block, and
    /// propagates any error reported by the block-level scan.
    pub fn scan_until_delimiter(&mut self, delimiter: char) -> Result<&str> {
        let top = self.use_last_block()?;
        top.scan_until(delimiter)
    }

    /// Scans a token from the active block using a character classifier built
    /// from `delims` and `space_class`.
    ///
    /// Returns an empty string when there is no active block; in that case
    /// `delims` is not validated at all.
    pub fn scan_classify(&mut self, delims: &str, space_class: u8) -> Result<&str> {
        match self.blocks.last_mut() {
            None => Ok(""),
            Some(top) => {
                // The classifier is only built once we know there is input to scan.
                let classifier = AsciiCharClassifier::with_delims(delims, space_class)?;
                top.scan_classify(&classifier)
            }
        }
    }

    /// Moves the cursor of the active block to `last_word_len` bytes past the
    /// start of the previously scanned word. Does nothing without an active
    /// block.
    pub fn rewind(&mut self, last_word_len: usize) {
        let Some(top) = self.blocks.last_mut() else {
            return;
        };
        top.rewind(last_word_len);
    }

    /// Skips whitespace in the active block, reading further lines as needed.
    ///
    /// Returns `Ok(true)` when a non-whitespace character is available and
    /// `Ok(false)` when the input is exhausted or there is no active block.
    pub fn scan_skip_whitespace(&mut self) -> Result<bool> {
        match self.blocks.last_mut() {
            Some(top) => top.scan_skip_whitespace(),
            None => Ok(false),
        }
    }

    /// Skips whitespace within the current line of the active block, if any.
    pub fn skip_line_whitespace(&mut self) {
        let Some(top) = self.blocks.last_mut() else {
            return;
        };
        top.skip_line_whitespace();
    }

    /// Advances the active block (if any) by a single character.
    pub fn skip_symbol(&mut self) {
        let Some(top) = self.blocks.last_mut() else {
            return;
        };
        top.skip_symbol();
    }

    /// Returns the active block state or an `UnexpectedEof` error when the
    /// stack is empty.
    fn use_last_block(&mut self) -> Result<&mut SourceBlockState> {
        match self.blocks.last_mut() {
            Some(top) => Ok(top),
            None => Err(UnexpectedEof.into()),
        }
    }
}
/// Snapshot of where the lexer currently is, borrowed from the active source
/// block (see `Lexer::get_position`).
#[derive(Debug, Clone, Copy)]
pub struct LexerPosition<'a> {
/// Number of source blocks on the lexer stack when the snapshot was taken.
pub offset: usize,
/// Name reported by the active source block.
pub source_block_name: &'a str,
/// Text of the line currently held by the active block.
pub line: &'a str,
/// Byte offset in `line` where the most recently scanned word starts.
pub word_start: usize,
/// Byte offset in `line` just past the most recently scanned word.
pub word_end: usize,
/// Number of lines read from the active block so far.
pub line_number: usize,
}
/// Character predicate used by the scanning helpers.
///
/// The method takes `&mut self` so that stateful closures can serve as
/// predicates.
pub trait Delimiter {
    /// Reports whether `c` satisfies this predicate.
    fn delim(&mut self, c: char) -> bool;
}

/// Any `FnMut(char) -> bool` (closure or function) is a predicate: it is
/// simply invoked with the character.
impl<F> Delimiter for F
where
    F: FnMut(char) -> bool,
{
    fn delim(&mut self, c: char) -> bool {
        self(c)
    }
}

/// A plain `char` matches exactly itself.
impl Delimiter for char {
    #[inline]
    fn delim(&mut self, c: char) -> bool {
        c == *self
    }
}
/// Scanning state attached to a single source block: the line currently being
/// tokenized plus the cursor and bookkeeping around it.
struct SourceBlockState {
/// The underlying source block; lines are pulled from its buffer.
block: SourceBlock,
/// Text of the line currently being scanned (refilled by `read_line`).
line: String,
/// When set, `scan_until` reads a fresh line before scanning. Starts out
/// `true` and is set again when `scan_until` runs off the end of the line.
require_next_line: bool,
/// Byte offset of the cursor within `line`.
line_offset: usize,
/// Byte offset in `line` where the most recently scanned token starts.
prev_word_start: usize,
/// Byte offset in `line` just past the most recently scanned token.
prev_word_end: usize,
/// Number of lines read so far (incremented on every `read_line`).
line_number: usize,
}
impl From<SourceBlock> for SourceBlockState {
    /// Creates the initial scanning state for `block`: no line has been read
    /// yet, so the line buffer is empty, all offsets and the line counter are
    /// zero, and a fresh line is requested.
    fn from(block: SourceBlock) -> Self {
        Self {
            require_next_line: true,
            line: String::new(),
            line_offset: 0,
            prev_word_start: 0,
            prev_word_end: 0,
            line_number: 0,
            block,
        }
    }
}
impl SourceBlockState {
    /// Scans the next whitespace-delimited word, reading further lines as
    /// needed.
    ///
    /// Records the word's byte range in `prev_word_start`/`prev_word_end` and
    /// leaves the cursor after any whitespace that follows the word on the
    /// same line. Returns `Ok(None)` once the input is exhausted.
    fn scan_word(&mut self) -> Result<Option<&str>> {
        loop {
            if !self.scan_skip_whitespace()? {
                return Ok(None);
            }

            let start = self.line_offset;
            self.prev_word_start = start;

            self.skip_until(char::is_whitespace);
            let end = self.line_offset;
            self.prev_word_end = end;

            self.skip_line_whitespace();

            if start != end {
                return Ok(Some(&self.line[start..end]));
            }
        }
    }

    /// Returns the text from the cursor up to (not including) the first
    /// occurrence of `c` on the current line and moves the cursor past `c`.
    ///
    /// A fresh line is read first when `require_next_line` is set. The NUL
    /// character is special: when it is not found, the rest of the line is
    /// returned and the next call starts on a new line.
    ///
    /// # Errors
    ///
    /// Fails when `c` is not NUL and does not occur in the rest of the line,
    /// or when reading the next line fails.
    fn scan_until(&mut self, c: char) -> Result<&str> {
        if self.require_next_line {
            self.read_line()?;
        }

        let start = self.line_offset;
        self.prev_word_start = start;

        // `char` implements `Delimiter`, so the cursor stops on the first `c`
        // or at the end of the line if there is none.
        self.skip_until(c);
        let end = self.line_offset;
        self.prev_word_end = end;

        // Stopping before the end of the line means the delimiter was hit.
        let found = end < self.line.len();
        anyhow::ensure!(found || c == '\0', "End delimiter `{c}` not found");

        if found {
            // Consume the delimiter itself.
            self.skip_symbol();
        } else {
            self.require_next_line = true;
        }

        Ok(&self.line[start..end])
    }

    /// Scans one token according to `classifier`.
    ///
    /// After skipping whitespace, characters are consumed until one of:
    /// * a `'\n'` or `'\r'` (not consumed);
    /// * a character whose class has bit 0 set while the token is already
    ///   non-empty (not consumed);
    /// * a character whose class has bit 1 set (consumed as the last
    ///   character of the token).
    ///
    /// Returns the scanned token, which is empty when the input is exhausted.
    fn scan_classify(&mut self, classifier: &AsciiCharClassifier) -> Result<&str> {
        // The "input exhausted" flag is intentionally ignored: the scan below
        // then simply yields an empty token.
        self.scan_skip_whitespace()?;

        let start = self.line_offset;
        self.prev_word_start = start;

        let mut skip = false;
        let mut empty = true;
        self.skip_until(|c| {
            if c == '\n' || c == '\r' {
                return true;
            }

            let class = classifier.classify(c);
            if class & 0b01 != 0 && !empty {
                return true;
            } else if class & 0b10 != 0 {
                skip = true;
                return true;
            }

            empty = false;
            false
        });

        if skip {
            // Include the terminating character in the token.
            self.skip_symbol();
        }

        self.prev_word_end = self.line_offset;
        Ok(&self.line[start..self.line_offset])
    }

    /// Moves the cursor to `last_word_len` bytes past the start of the
    /// previously scanned token and treats that as the token's new end.
    ///
    /// The target offset is clamped to the end of the current line and moved
    /// back to the nearest character boundary, so an oversized or misaligned
    /// `last_word_len` can never leave the cursor at a position where slicing
    /// `line` would panic.
    fn rewind(&mut self, last_word_len: usize) {
        let mut offset = self
            .prev_word_start
            .saturating_add(last_word_len)
            .min(self.line.len());

        // Offset 0 and `line.len()` are always boundaries, so this terminates.
        while !self.line.is_char_boundary(offset) {
            offset -= 1;
        }

        self.line_offset = offset;
        self.prev_word_end = offset;
    }

    /// Skips whitespace, pulling in new lines until a non-whitespace character
    /// is under the cursor.
    ///
    /// Returns `Ok(true)` when such a character exists and `Ok(false)` when
    /// `read_line` reports the end of input.
    fn scan_skip_whitespace(&mut self) -> Result<bool> {
        loop {
            self.skip_line_whitespace();

            if self.line_offset < self.line.len() {
                return Ok(true);
            }

            // The current line is fully consumed here, so the only way forward
            // is another line.
            if !self.read_line()? {
                return Ok(false);
            }
        }
    }

    /// Advances the cursor past whitespace on the current line.
    fn skip_line_whitespace(&mut self) {
        self.skip_while(char::is_whitespace)
    }

    /// Advances the cursor to the first character matching `p`, or to the end
    /// of the line when there is none.
    fn skip_until<P: Delimiter>(&mut self, mut p: P) {
        self.skip_while(|c| !p.delim(c));
    }

    /// Advances the cursor by exactly one character (if any is left).
    fn skip_symbol(&mut self) {
        // The predicate is `true` for the first character only: `take` returns
        // the current flag and leaves `false` behind.
        let mut first = true;
        self.skip_while(|_| std::mem::take(&mut first))
    }

    /// Advances the cursor while `p` holds, stopping at the first character
    /// for which it does not, or at the end of the line.
    fn skip_while<P: Delimiter>(&mut self, mut p: P) {
        let prev_offset = self.line_offset;
        for (offset, c) in self.line[self.line_offset..].char_indices() {
            if !p.delim(c) {
                // `offset` is relative to the slice start.
                self.line_offset = prev_offset + offset;
                return;
            }
        }
        self.line_offset = self.line.len();
    }

    /// Replaces the current line with the next one from the block's buffer.
    ///
    /// Resets the cursor, the previous-word range and `require_next_line`, and
    /// bumps `line_number`. If the very first line starts with `#!`, it is
    /// skipped and the following line is read instead.
    ///
    /// Returns `Ok(true)` when the read reported a non-zero length, and
    /// `Ok(false)` otherwise (presumably end of input; verify against the
    /// buffer type's `read_line` contract).
    fn read_line(&mut self) -> Result<bool> {
        const SKIP_PREFIX: &str = "#!";

        self.require_next_line = false;
        self.prev_word_start = 0;
        self.prev_word_end = 0;
        self.line_offset = 0;
        self.line_number += 1;
        self.line.clear();

        let not_eof = self.block.buffer_mut().read_line(&mut self.line)? > 0;
        if not_eof && self.line_number == 1 && self.line.starts_with(SKIP_PREFIX) {
            // `line_number` is 2 inside the nested call, so this recurses at
            // most once.
            self.read_line()
        } else {
            Ok(not_eof)
        }
    }
}
/// Lookup table assigning a 2-bit class to every ASCII character.
struct AsciiCharClassifier {
/// Packed classes: four characters per byte, two bits each. The entry for
/// character `c` lives in `data[c >> 2]` at bit offset `(c & 0b11) * 2`.
data: [u8; 64],
}
impl AsciiCharClassifier {
    /// Builds a classifier from a delimiter specification.
    ///
    /// Space and tab are first assigned `space_class` (only its two low bits
    /// are used). `delims` is then read left to right: every non-space
    /// character is assigned the current class, which starts at `0b11`, and
    /// every space in `delims` lowers the current class by one. Characters
    /// that are never mentioned keep class `0`.
    ///
    /// # Errors
    ///
    /// Fails when `delims` contains non-ASCII characters, or when it contains
    /// more spaces than there are classes to step down through.
    fn with_delims(delims: &str, space_class: u8) -> Result<Self> {
        /// Stores the two low bits of `class` into the table slot of `symbol`.
        fn assign(table: &mut [u8; 64], symbol: u8, class: u8) {
            let shift = (symbol & 0b11) * 2;
            let cell = &mut table[(symbol >> 2) as usize];
            let cleared = *cell & !(0b11 << shift);
            *cell = cleared | ((class & 0b11) << shift);
        }

        anyhow::ensure!(
            delims.is_ascii(),
            "Non-ascii symbols are not supported by character classifier"
        );

        let mut table = [0u8; 64];
        assign(&mut table, b' ', space_class);
        assign(&mut table, b'\t', space_class);

        let mut current = 0b11u8;
        for &symbol in delims.as_bytes() {
            match symbol {
                // A space switches to the next (lower) class.
                b' ' => current = current.checked_sub(1).context("Too many classes")?,
                other => assign(&mut table, other, current),
            }
        }

        Ok(Self { data: table })
    }

    /// Returns the 2-bit class of `c`; non-ASCII characters are class `0`.
    fn classify(&self, c: char) -> u8 {
        if !c.is_ascii() {
            return 0;
        }

        let byte = c as u8;
        let shift = (byte & 0b11) * 2;
        let packed = self.data[(byte >> 2) as usize];
        (packed >> shift) & 0b11
    }
}