pub(crate) mod tests;
use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::io::Read;
use std::ops::{Add, AddAssign};
use crate::segmap::{char_to_group, GroupId, SegMap};
use crate::char_reader::{escape_char, CharReader};
use crate::TokenId;
/// Identifier of a lexer state; used to index rows of the lexer's state table.
pub type StateId = usize;
/// Identifier of the channel a token is emitted on (`0` is the default value).
pub type ChannelId = u16;
/// Identifier of a lexer mode, carried by [`ModeOption`].
pub type ModeId = u16;
/// Describes what the lexer does when a match ends in an accepting state.
///
/// A terminal combines an action (skip, emit a token, or keep accumulating text), the channel
/// of the emitted token, and optional mode / state-stack operations.
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub struct Terminal {
    /// Action applied to the matched text.
    pub action: ActionOption,
    /// Channel returned with the token (`0` by default).
    pub channel: ChannelId,
    /// Requested mode change, if any.
    pub mode: ModeOption,
    /// State the lexer starts from after this terminal, if any.
    pub mode_state: Option<StateId>,
    /// `true` if the start state must be restored from the lexer's state stack.
    pub pop: bool
}

impl Terminal {
    /// Returns `true` if the terminal only skips the matched text: no mode change, no target
    /// state and no pop.
    #[inline]
    pub fn is_only_skip(&self) -> bool {
        let changes_mode = !self.mode.is_none() || self.mode_state.is_some() || self.pop;
        self.action.is_skip() && !changes_mode
    }

    /// Returns `true` if the terminal emits a token.
    #[inline]
    pub fn is_token(&self) -> bool {
        self.action.is_token()
    }

    /// Returns the emitted token, or `None` if the action isn't a token.
    #[inline]
    pub fn get_token(&self) -> Option<TokenId> {
        self.action.get_token()
    }

    /// Builds the source text of the `term!(...)` macro invocations, joined by `" + "`, that
    /// describe this terminal.
    pub fn to_macro(&self) -> String {
        let action = match self.action {
            ActionOption::Skip => "term!(skip)".to_string(),
            ActionOption::Token(t) => format!("term!(={t})"),
            ActionOption::More => "term!(more)".to_string(),
        };
        let mut parts = vec![action];
        if self.channel != 0 {
            parts.push(format!("term!(#{})", self.channel));
        }
        let mode = match self.mode {
            ModeOption::None => None,
            ModeOption::Mode(m) => Some(format!("term!(mode {m})")),
            ModeOption::Push(m) => Some(format!("term!(push {m})")),
        };
        parts.extend(mode);
        parts.extend(self.mode_state.map(|id| format!("term!(pushst {id})")));
        if self.pop {
            parts.push("term!(pop)".to_string());
        }
        parts.join(" + ")
    }
}

impl Display for Terminal {
    /// Writes a compact form such as `<end:3,ch 1,push(2,state 5),pop>`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "<{}", self.action)?;
        if self.channel != 0 {
            write!(f, ",ch {}", self.channel)?;
        }
        let has_mode_part = self.mode_state.is_some() || !self.mode.is_none();
        if has_mode_part {
            // NOTE(review): when `mode` is `None` but `mode_state` is set, the closing ')'
            // below has no matching '('. The output is kept as is.
            match self.mode {
                ModeOption::Mode(m) => write!(f, ",mode({m}")?,
                ModeOption::Push(m) => write!(f, ",push({m}")?,
                ModeOption::None => {}
            }
            if let Some(s) = self.mode_state {
                write!(f, ",state {s}")?;
            }
            f.write_str(")")?;
        }
        if self.pop {
            f.write_str(",pop")?;
        }
        f.write_str(">")
    }
}

impl Add for Terminal {
    type Output = Terminal;

    /// Merges two terminals: actions and channels are added, the left-hand mode and target
    /// state take precedence over the right-hand ones, and `pop` is set if either side sets it.
    ///
    /// # Panics
    ///
    /// Panics if both actions are different from `Skip` (see `Add for ActionOption`).
    fn add(self, rhs: Self) -> Self::Output {
        let Terminal { action, channel, mode, mode_state, pop } = self;
        Terminal {
            action: action + rhs.action,
            channel: channel + rhs.channel,
            mode: if mode.is_none() { rhs.mode } else { mode },
            mode_state: mode_state.or(rhs.mode_state),
            pop: pop || rhs.pop,
        }
    }
}
/// Action attached to a terminal.
#[derive(Clone, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ActionOption {
    /// The matched text is discarded (default).
    #[default] Skip,
    /// The matched text is returned as the given token.
    Token(TokenId),
    /// The matched text is kept and prepended to what is scanned next.
    More
}

impl ActionOption {
    /// Returns `true` for [`ActionOption::Skip`].
    pub fn is_skip(&self) -> bool {
        matches!(self, ActionOption::Skip)
    }

    /// Returns `true` for [`ActionOption::Token`].
    pub fn is_token(&self) -> bool {
        self.get_token().is_some()
    }

    /// Returns `true` for [`ActionOption::More`].
    pub fn is_more(&self) -> bool {
        matches!(self, ActionOption::More)
    }

    /// Returns the token carried by [`ActionOption::Token`], or `None` for the other variants.
    pub fn get_token(&self) -> Option<TokenId> {
        match self {
            ActionOption::Token(token) => Some(*token),
            _ => None,
        }
    }
}

impl Add for ActionOption {
    type Output = Self;

    /// Combines two actions, where `Skip` is the neutral element.
    ///
    /// # Panics
    ///
    /// Panics if neither operand is `Skip`.
    fn add(self, rhs: Self) -> Self::Output {
        match (self, rhs) {
            (ActionOption::Skip, other) => other,
            (other, ActionOption::Skip) => other,
            (lhs, rhs) => panic!("can't add {lhs:?} and {rhs:?}"),
        }
    }
}

impl Display for ActionOption {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            ActionOption::Token(t) => write!(f, "end:{t}"),
            ActionOption::Skip => f.write_str("skip"),
            ActionOption::More => f.write_str("more"),
        }
    }
}
/// Mode operation attached to a terminal.
#[derive(Clone, Copy, Debug, PartialEq, Default, PartialOrd, Eq, Ord)]
pub enum ModeOption {
    /// No mode change (default).
    #[default]
    None,
    /// Switches to the given mode.
    Mode(ModeId),
    /// Switches to the given mode with a push; the lexer then saves its current start state
    /// on its state stack.
    Push(ModeId)
}

impl ModeOption {
    /// Returns `true` if no mode operation is requested.
    pub fn is_none(&self) -> bool {
        matches!(*self, ModeOption::None)
    }

    /// Returns `true` for [`ModeOption::Mode`].
    pub fn is_mode(&self) -> bool {
        matches!(*self, ModeOption::Mode(..))
    }

    /// Returns `true` for [`ModeOption::Push`].
    pub fn is_push(&self) -> bool {
        matches!(*self, ModeOption::Push(..))
    }
}
/// Column number of a caret position.
pub type CaretCol = u64;
/// Line number of a caret position.
pub type CaretLine = u64;

/// Caret position in a text, as `Pos(line, column)`.
///
/// Positions are ordered by line first, then by column.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
pub struct Pos(pub CaretLine, pub CaretCol);

impl Pos {
    /// Returns the line number.
    pub fn line(&self) -> CaretLine {
        self.0
    }

    /// Returns the column number.
    pub fn col(&self) -> CaretCol {
        self.1
    }

    /// Moves the position past the character `c`.
    ///
    /// * `'\t'` moves the column to the next tab stop; tab stops are located at columns
    ///   `1`, `1 + tab_width`, `1 + 2 * tab_width`, ...
    /// * `'\n'` moves to column 1 of the next line.
    /// * `'\r'` leaves the position unchanged.
    /// * any other character moves one column to the right.
    ///
    /// A `tab_width` of 0 is treated as 1 (a tab then advances by one column) instead of
    /// causing a division by zero. The column is expected to be at least 1.
    pub fn update_pos(&mut self, c: char, tab_width: CaretCol) {
        match c {
            '\t' => {
                // guards the remainder below against a zero divisor
                let width = tab_width.max(1);
                self.1 = self.1 - (self.1 - 1) % width + width;
            }
            '\n' => {
                self.0 += 1;
                self.1 = 1;
            }
            '\r' => {}
            _ => self.1 += 1,
        }
    }
}

impl Display for Pos {
    /// Writes the position as `line:column`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}:{}", self.0, self.1)
    }
}
/// Span of text delimited by the positions of its first and last characters.
///
/// A span is empty when `first > last`; [`PosSpan::empty`] returns the canonical empty span.
#[derive(Clone, PartialEq, Debug)]
pub struct PosSpan {
    /// Position of the first character.
    pub first: Pos,
    /// Position of the last character.
    pub last: Pos,
}

impl PosSpan {
    /// Creates a span from its first and last positions.
    #[inline(always)]
    pub fn new(first: Pos, last: Pos) -> Self {
        PosSpan { first, last }
    }

    /// Creates an empty span (`first` is after `last`).
    #[inline(always)]
    pub fn empty() -> Self {
        PosSpan { first: Pos(1, 1), last: Pos(0, 0) }
    }

    /// Returns the current span and leaves an empty span in its place.
    pub fn take(&mut self) -> PosSpan {
        std::mem::take(self)
    }

    /// Returns `true` if the span is empty.
    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        self.first > self.last
    }

    /// Returns `true` if the span isn't empty.
    #[inline(always)]
    pub fn is_not_empty(&self) -> bool {
        self.first <= self.last
    }

    /// Returns the first position, or `None` if the span is empty.
    pub fn first(&self) -> Option<Pos> {
        self.is_not_empty().then_some(self.first)
    }

    /// Returns the first position.
    ///
    /// # Panics
    ///
    /// Panics if the span is empty.
    pub fn first_forced(&self) -> Pos {
        if self.is_empty() {
            panic!("span is empty")
        }
        self.first
    }

    /// Returns the last position, or `None` if the span is empty.
    pub fn last(&self) -> Option<Pos> {
        self.is_not_empty().then_some(self.last)
    }

    /// Returns the last position.
    ///
    /// # Panics
    ///
    /// Panics if the span is empty.
    pub fn last_forced(&self) -> Pos {
        if self.is_empty() {
            panic!("span is empty")
        }
        self.last
    }
}

impl AddAssign<&PosSpan> for PosSpan {
    /// Extends the span up to the end of `rhs`.
    ///
    /// An empty `rhs` changes nothing; an empty `self` becomes a copy of `rhs`.
    fn add_assign(&mut self, rhs: &Self) {
        if rhs.is_empty() {
            return;
        }
        if self.is_empty() {
            self.first = rhs.first;
        }
        self.last = rhs.last;
    }
}

impl Add<&PosSpan> for &PosSpan {
    type Output = PosSpan;

    fn add(self, rhs: &PosSpan) -> Self::Output {
        let mut sum = self.clone();
        sum += rhs;
        sum
    }
}

impl Add<&PosSpan> for PosSpan {
    type Output = PosSpan;

    // `self` is already owned: it is extended in place rather than cloned first.
    fn add(mut self, rhs: &PosSpan) -> Self::Output {
        self += rhs;
        self
    }
}

impl Add<PosSpan> for PosSpan {
    type Output = PosSpan;

    fn add(mut self, rhs: PosSpan) -> Self::Output {
        self += &rhs;
        self
    }
}

impl Default for PosSpan {
    /// The default span is the empty span.
    fn default() -> Self {
        PosSpan::empty()
    }
}

impl Display for PosSpan {
    /// Writes `<empty>` for an empty span; otherwise `line:col` (single position),
    /// `line:col-col` (same line) or `line:col-line:col`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        if self.is_empty() {
            return write!(f, "<empty>");
        }
        let PosSpan { first, last } = self;
        if first == last {
            write!(f, "{first}")
        } else if first.0 == last.0 {
            write!(f, "{first}-{}", last.1)
        } else {
            write!(f, "{first}-{last}")
        }
    }
}
/// Context captured by the lexer when it reports an error.
#[derive(Clone, PartialEq, Debug)]
pub struct LexerErrorInfo {
/// Value of the lexer's stream position counter when the error was detected.
pub pos: u64,
/// Line of the lexer's cursor when the error was detected.
pub line: CaretLine,
/// Column of the lexer's cursor when the error was detected.
pub col: CaretCol,
/// Character read when the error was detected, or `None` at the end of the stream.
pub curr_char: Option<char>,
/// Group of `curr_char`.
pub group: GroupId,
/// Lexer state in which the error was detected.
pub state: StateId,
/// Text scanned before the error.
pub text: String,
}
/// Status / error reported by the lexer.
#[derive(Clone, PartialEq, Debug)]
pub enum LexerError {
    /// No error.
    None,
    /// A token was requested but no stream is attached to the lexer.
    NoStreamAttached,
    /// The stream ended while the lexer wasn't in an accepting state.
    EndOfStream { info: LexerErrorInfo },
    /// The character belongs to a known group but isn't accepted in the current state.
    InvalidChar { info: LexerErrorInfo },
    /// The character doesn't belong to any group known by the lexer.
    UnrecognizedChar { info: LexerErrorInfo },
    /// The lexer stayed in the same state without consuming any input.
    InfiniteLoop { pos: u64 },
    /// A terminal requested a pop while the state stack was empty.
    EmptyStateStack { info: LexerErrorInfo }
}

impl Display for LexerError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            LexerError::None => write!(f, "no error"),
            LexerError::NoStreamAttached => write!(f, "no stream attached"),
            LexerError::EndOfStream { info: LexerErrorInfo { pos, line, col, .. } } =>
                write!(f, "end of stream, line {line}, col {col} (stream pos = {pos})"),
            LexerError::InvalidChar { info } =>
                Self::fmt_char_error(f, "invalid character", info),
            LexerError::UnrecognizedChar { info } =>
                Self::fmt_char_error(f, "unrecognized character", info),
            LexerError::InfiniteLoop { pos } =>
                write!(f, "infinite loop (stream pos = {pos})"),
            LexerError::EmptyStateStack { info: LexerErrorInfo { pos, line, col, curr_char, .. } } => {
                write!(f, "pop from empty stack, line {line}, col {col}")?;
                if let Some(c) = curr_char {
                    write!(f, ", chr = '{c}'")?;
                }
                write!(f, " (stream pos = {pos})")
            }
        }
    }
}

impl LexerError {
    /// Writes `"<what> '<char>', line L, col C (stream pos = P)"`.
    ///
    /// The quoted character is left out when `info.curr_char` is `None`, so that formatting
    /// a hand-built error value never panics.
    fn fmt_char_error(f: &mut Formatter<'_>, what: &str, info: &LexerErrorInfo) -> std::fmt::Result {
        let LexerErrorInfo { pos, line, col, curr_char, .. } = info;
        match curr_char {
            Some(c) => write!(f, "{what} '{c}', line {line}, col {col} (stream pos = {pos})"),
            None => write!(f, "{what}, line {line}, col {col} (stream pos = {pos})"),
        }
    }

    /// Returns the stream position attached to the error, or `None` for the variants that
    /// don't carry one (`None`, `NoStreamAttached`).
    pub fn get_pos(&self) -> Option<u64> {
        match self {
            LexerError::None | LexerError::NoStreamAttached => None,
            LexerError::InfiniteLoop { pos } => Some(*pos),
            LexerError::EndOfStream { info }
            | LexerError::InvalidChar { info }
            | LexerError::UnrecognizedChar { info }
            | LexerError::EmptyStateStack { info } => Some(info.pos),
        }
    }

    /// Returns the `(line, column)` attached to the error, or `None` for the variants that
    /// don't carry them (`None`, `NoStreamAttached`, `InfiniteLoop`).
    pub fn get_line_col(&self) -> Option<(CaretLine, CaretCol)> {
        match self {
            LexerError::None
            | LexerError::NoStreamAttached
            | LexerError::InfiniteLoop { .. } => None,
            LexerError::EndOfStream { info }
            | LexerError::InvalidChar { info }
            | LexerError::UnrecognizedChar { info }
            | LexerError::EmptyStateStack { info } => Some((info.line, info.col)),
        }
    }
}
/// Token returned by the lexer: `(token ID, channel, matched text, span of the matched text)`.
pub type LexerToken = (TokenId, ChannelId, String, PosSpan);
/// Table-driven lexer reading characters from a `CharReader<R>` and producing [`LexerToken`] items.
///
/// The tables are borrowed for `'a`; the stream is attached separately with `attach_stream`.
pub struct Lexer<'a, R> {
// --- scanning status ---
// Attached character stream, or `None` if no stream is attached.
pub(crate) input: Option<CharReader<R>>,
// Status of the last `get_token` call (`LexerError::None` if it succeeded).
pub(crate) error: LexerError,
// `true` if the last character read by `get_token` was the end of the stream.
pub(crate) is_eos: bool,
// Number of characters consumed so far (incremented by `update_pos`).
pub(crate) pos: u64,
// Line / column of the next character.
pub(crate) cursor: Pos,
// Tab width used to update the cursor column (4 after `new`).
pub(crate) tab_width: CaretCol,
// Start states saved by `push` terminals and restored by `pop` terminals.
pub(crate) state_stack: Vec<StateId>,
// State from which the next scan starts.
pub(crate) start_state: StateId,
// --- lexer tables ---
/// Number of character groups; also the row length of `state_table`, and the group value
/// given to characters that have no group and to the end of the stream.
pub nbr_groups: u32,
/// Start state selected when a stream is attached.
pub initial_state: StateId,
/// `first_end_state`: first accepting state (accepting states are `first_end_state..nbr_states`);
/// `nbr_states`: number of states (a transition to a value `>= nbr_states` is not a valid transition);
/// `ascii_to_group`: one of the three character-to-group tables given to `char_to_group`.
pub first_end_state: StateId, pub nbr_states: StateId, pub ascii_to_group: &'a [GroupId],
/// Character-to-group table given to `char_to_group`.
pub utf8_to_group: HashMap<char, GroupId>,
/// Character-to-group table given to `char_to_group`.
pub seg_to_group: SegMap<GroupId>,
/// Transition table: `state_table[nbr_groups * state + group]` is the next state.
pub state_table: &'a [StateId],
/// Terminals of the accepting states: `terminal_table[state - first_end_state]`.
pub terminal_table: &'a [Terminal], }
/// Stream management and token extraction.
impl<'a, R: Read> Lexer<'a, R> {
/// Creates a lexer from its tables, with no stream attached.
///
/// The cursor is set to line 1, column 1 and the tab width to 4.
pub fn new(
nbr_groups: u32,
initial_state: StateId,
first_end_state: StateId, nbr_states: StateId, ascii_to_group: &'a [GroupId],
utf8_to_group: HashMap<char, GroupId>,
seg_to_group: SegMap<GroupId>,
state_table: &'a [StateId],
terminal_table: &'a [Terminal], ) -> Self {
Lexer {
input: None,
error: LexerError::None,
is_eos: false,
pos: 0,
cursor: Pos(1, 1),
tab_width: 4,
state_stack: Vec::new(),
start_state: 0,
nbr_groups,
initial_state,
first_end_state,
nbr_states,
ascii_to_group,
utf8_to_group,
seg_to_group,
state_table,
terminal_table,
}
}
/// Attaches a new input stream and resets the scanning status: end-of-stream flag,
/// stream position, cursor, state stack and start state. The error status and the tab
/// width are left unchanged.
pub fn attach_stream(&mut self, input: CharReader<R>) {
self.input = Some(input);
self.is_eos = false;
self.pos = 0;
self.cursor = Pos(1, 1);
self.state_stack.clear();
self.start_state = self.initial_state;
}
/// Detaches and returns the input stream, if there was one.
pub fn detach_stream(&mut self) -> Option<CharReader<R>> {
self.input.take()
}
/// Sets the tab width used to compute the column after a tab character.
pub fn set_tab_width(&mut self, width: CaretCol) {
self.tab_width = width;
}
/// Returns the tab width used to compute the column after a tab character.
pub fn get_tab_width(&self) -> CaretCol {
self.tab_width
}
/// Returns a reference to the attached stream, if any.
pub fn stream(&self) -> Option<&CharReader<R>> {
self.input.as_ref()
}
/// Returns `true` if a stream is attached and reports that it is reading (`CharReader::is_reading`).
pub fn is_open(&self) -> bool {
self.input.as_ref().map(|input| input.is_reading()).unwrap_or(false)
}
/// Returns an iterator that pulls the tokens from this lexer.
pub fn tokens(&mut self) -> LexInterpretIter<'_, 'a, R> {
LexInterpretIter { lexer: self, error_info: None, mode: LexInterpretIterMode::Normal }
}
/// Scans the input for the next token.
///
/// Returns
/// * `Ok(Some(token))` when a terminal that emits a token is reached,
/// * `Ok(None)` when the stream ends on an accepting state whose terminal doesn't emit a token,
/// * `Err(error)` otherwise; the error is also stored and available with `get_error`.
///   `LexerError::NoStreamAttached` is returned if no stream is attached.
///
/// After an `InvalidChar` or `UnrecognizedChar` error, the offending character is consumed,
/// so the next call resumes after it.
pub fn get_token(&mut self) -> Result<Option<LexerToken>, LexerError> {
const VERBOSE: bool = false;
if VERBOSE { println!("lexer state_table: {}, last: {}", self.state_table.len(), self.state_table.iter().last().unwrap()); }
// Each call starts with a cleared error status.
self.error = LexerError::None;
// `text` holds the characters of the current match; `more_text` accumulates the text
// matched by the previous `more` terminals.
let mut text = String::new();
let mut more_text = String::new(); if self.input.is_some() {
let mut state = self.start_state;
let mut first_pos = self.cursor;
let mut last_pos = first_pos;
// Debug builds only: variables used to detect that the lexer stays in the same state
// at the same stream offset.
#[cfg(debug_assertions)] let mut last_state: Option<StateId> = None;
#[cfg(debug_assertions)] let mut last_offset: Option<u64> = None;
#[cfg(debug_assertions)] let mut infinite_loop_cnt = 0_u32;
loop {
if VERBOSE { print!("- state = {state}"); }
#[allow(clippy::unnecessary_unwrap)] let input = self.input.as_mut().unwrap();
#[cfg(debug_assertions)] {
// Same state and same offset as in the previous iteration: count it, and give up
// once the counter exceeds 3.
if last_state.map(|st| st == state).unwrap_or(false) && last_offset.map(|offset| offset == input.get_offset()).unwrap_or(false) {
if infinite_loop_cnt > 3 {
self.error = LexerError::InfiniteLoop { pos: self.pos };
if VERBOSE { println!(" => Err({})", self.error); }
return Err(self.error.clone());
}
infinite_loop_cnt += 1;
} else {
infinite_loop_cnt = 0;
}
last_state = Some(state);
last_offset = Some(input.get_offset());
}
// Next character; `None` is handled as the end of the stream.
let c_opt = input.get_char();
let is_eos = c_opt.is_none();
self.is_eos = is_eos;
// The end of the stream and the characters without a group get the value `nbr_groups`.
let group = c_opt.and_then(|c| char_to_group(self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, c))
.unwrap_or(self.nbr_groups);
if VERBOSE { print!(", char '{}' group {}", if let Some(c) = c_opt { escape_char(c) } else { "<EOF>".to_string() }, group); }
let new_state = self.state_table[self.nbr_groups as usize * state + group as usize];
// No valid transition: the current match ends here. The character is given back to
// the reader since it isn't part of the match.
if new_state >= self.nbr_states || group >= self.nbr_groups { if let Some(c) = c_opt {
input.rewind(c).unwrap_or_else(|_| panic!("Can't rewind character '{}'", escape_char(c)));
}
let is_accepting = self.first_end_state <= state && state < self.nbr_states;
if is_accepting { let terminal = &self.terminal_table[state - self.first_end_state];
// `pop`: the next start state is restored from the stack; an empty stack is an error.
if terminal.pop {
if self.state_stack.is_empty() {
self.error = LexerError::EmptyStateStack {
info: LexerErrorInfo {
pos: self.pos,
line: self.cursor.line(),
col: self.cursor.col(),
curr_char: c_opt,
group,
state,
text: more_text + &text,
}
};
if VERBOSE { println!(" => Err({})", self.error); }
return Err(self.error.clone());
}
self.start_state = self.state_stack.pop().unwrap();
if VERBOSE { print!(", pop to {}", self.start_state); }
}
// Mode change: the current start state is saved first if the terminal is a `push`.
if let Some(goto_state) = terminal.mode_state {
if terminal.mode.is_push() {
self.state_stack.push(self.start_state);
}
self.start_state = goto_state;
if VERBOSE { print!(", {}({})", if terminal.mode.is_push() { "push" } else { "mode" }, goto_state); }
}
// Token terminal: returns the token with the accumulated `more` text and the current text.
if let Some(token) = &terminal.get_token() {
if VERBOSE { println!(" => OK: token {}", token); }
return Ok(Some((*token, terminal.channel, more_text + &text, PosSpan::new(first_pos, last_pos))));
}
// Skipped text doesn't belong to the next token, whose span starts at the current
// cursor; `more` keeps the start of the span.
if !terminal.action.is_more() {
first_pos = self.cursor;
}
// Skip / more: scanning goes on from the start state, unless the stream has ended.
if !is_eos { if VERBOSE { println!(" => {}, state {}", terminal.action, self.start_state); }
state = self.start_state;
if terminal.action.is_more() {
more_text.push_str(&text);
}
text.clear();
continue;
}
}
// End of the stream on an accepting state without token: nothing left to return.
if is_eos && is_accepting {
return Ok(None);
}
// Everything else is an error.
let info = LexerErrorInfo {
pos: self.pos,
line: self.cursor.line(),
col: self.cursor.col(),
curr_char: c_opt,
group,
state,
text: more_text + &text,
};
self.error = if is_eos {
LexerError::EndOfStream { info }
} else if group >= self.nbr_groups {
// The offending character (rewound above) is read again and consumed, so that the
// next call resumes after it.
let c = input.get_char().unwrap(); self.update_pos(c);
LexerError::UnrecognizedChar { info }
} else {
let c = input.get_char().unwrap(); self.update_pos(c);
LexerError::InvalidChar { info }
};
if VERBOSE { println!(" => Err({})", self.error); }
return Err(self.error.clone());
} else {
// Valid transition: the character is added to the match and the cursor moves on.
// `last_pos` is taken before the move, so it is the position of that character.
last_pos = self.cursor;
if let Some(c) = c_opt {
text.push(c);
self.update_pos(c);
}
if VERBOSE { println!(" => state {new_state}"); }
state = new_state;
}
}
}
// Only reached when no stream is attached (the loop above is only left by `return`).
self.error = LexerError::NoStreamAttached;
if VERBOSE { println!(" => Err({})", self.error); }
Err(self.error.clone())
}
/// Moves the cursor past the character `c` and increments the stream position.
pub fn update_pos(&mut self, c: char) {
self.cursor.update_pos(c, self.tab_width);
self.pos += 1;
}
/// Returns the status of the last `get_token` call.
pub fn get_error(&self) -> &LexerError {
&self.error
}
/// Returns `true` if the last `get_token` call reported an error.
pub fn has_error(&self) -> bool {
self.error != LexerError::None
}
/// Returns `true` if the last character read by `get_token` was the end of the stream.
pub fn is_eos(&self) -> bool {
self.is_eos
}
}
/// Internal mode of [`LexInterpretIter`]: `Error` means that the previous call recorded a
/// lexer error which must be reported by the next call.
#[derive(Debug)]
enum LexInterpretIterMode { Normal, Error }
/// Iterator over the tokens produced by a [`Lexer`]; created by `Lexer::tokens`.
pub struct LexInterpretIter<'a, 'b, R> {
// Lexer the tokens are pulled from.
lexer: &'a mut Lexer<'b, R>,
// Information of the last invalid / unrecognized character error, if any.
error_info: Option<LexerErrorInfo>,
// See `LexInterpretIterMode`.
mode: LexInterpretIterMode
}
impl<'a, 'b, R: Read> Iterator for LexInterpretIter<'a, 'b, R> {
type Item = LexerToken;
fn next(&mut self) -> Option<Self::Item> {
if self.lexer.is_eos {
None
} else {
match self.mode {
LexInterpretIterMode::Normal => {
let t = self.lexer.get_token();
match t {
Ok(Some(token)) => Some(token),
Err(LexerError::InvalidChar { info } | LexerError::UnrecognizedChar { info }) => {
self.error_info = Some(info);
self.mode = LexInterpretIterMode::Error;
None
}
_ => {
None
}
}
}
LexInterpretIterMode::Error => {
let info = self.error_info.as_ref().unwrap();
self.mode = LexInterpretIterMode::Normal;
let msg = format!("{}, scanned before = '{}'", self.lexer.get_error(), self.error_info.as_ref().unwrap().text);
let pos = Pos(info.line, info.col);
Some((TokenId::MAX, 0, msg, PosSpan::new(pos, pos)))
}
}
}
}
}
/// Iterator adapter that yields the tokens of one channel and gives the tokens of the other
/// channels to a closure. Built by the methods of `TokenSpliterator`.
pub struct TokenSplit<I, F> {
// Source of tokens.
iter: I,
// Channel of the tokens that are yielded.
ch: ChannelId,
// Closure receiving the tokens of the other channels.
f: F
}
/// Channel-related adapters for iterators of [`LexerToken`] items.
pub trait TokenSpliterator: Iterator<Item=LexerToken> {
    /// Keeps the tokens of channel 0 and gives the tokens of the other channels to `f`.
    fn split_channel0<F>(self, f: F) -> TokenSplit<Self, F>
        where Self: Sized,
              F: FnMut((TokenId, ChannelId, String, PosSpan))
    {
        self.split_channels(0, f)
    }

    /// Keeps the tokens of `channel` and gives the tokens of the other channels to `f`.
    fn split_channels<F>(self, channel: ChannelId, f: F) -> TokenSplit<Self, F>
        where Self: Sized,
              F: FnMut(LexerToken)
    {
        TokenSplit { iter: self, ch: channel, f }
    }

    /// Keeps the tokens of channel 0, without their channel, and drops the other tokens.
    fn keep_channel0(self) -> impl Iterator<Item=(TokenId, String, PosSpan)>
        where Self: Sized
    {
        self.filter(|(_, ch, _, _)| *ch == 0)
            .map(|(token, _, text, span)| (token, text, span))
    }

    /// Keeps the tokens of `channel`; the tokens of the other channels are given to a
    /// function that does nothing.
    fn keep_channel(self, channel: ChannelId) -> TokenSplit<Self, fn(LexerToken)>
        where Self: Sized
    {
        fn discard(_: LexerToken) {}
        TokenSplit { iter: self, ch: channel, f: discard }
    }
}
impl<I, F> Iterator for TokenSplit<I, F>
    where I: Iterator<Item=LexerToken>,
          F: FnMut((TokenId, ChannelId, String, PosSpan))
{
    type Item = (TokenId, String, PosSpan);

    /// Returns the next token of the selected channel, without its channel.
    ///
    /// The tokens of the other channels met on the way are given to the closure and the
    /// search goes on; they don't end the iteration. `None` is returned only when the source
    /// iterator returns `None`. The source is polled again by the next call, so a source that
    /// resumes after a `None` still works.
    fn next(&mut self) -> Option<Self::Item> {
        for (token, ch, text, span) in self.iter.by_ref() {
            if ch == self.ch {
                return Some((token, text, span));
            }
            (self.f)((token, ch, text, span));
        }
        None
    }
}
/// Makes the `TokenSpliterator` adapters available on every iterator of [`LexerToken`] items.
impl<I: Iterator<Item=LexerToken>> TokenSpliterator for I {}