use std::collections::{BTreeMap, BTreeSet};
use crate::char_stream::{CharStream, TextInterval};
use crate::int_stream::EOF;
use crate::recognizer::{Recognizer, RecognizerData};
use crate::token::{CommonToken, CommonTokenFactory, TokenFactory, TokenSourceError, TokenSpec};
/// Sentinel value `-3`.
///
/// NOTE(review): presumably the token type a rule reports when its match should be
/// discarded (ANTLR's `skip` command) — it is not used in this part of the file; confirm.
pub const SKIP: i32 = -3;
/// Sentinel value `-2`.
///
/// NOTE(review): presumably the token type a rule reports when matching should continue
/// into the same token (ANTLR's `more` command) — it is not used in this part of the file; confirm.
pub const MORE: i32 = -2;
/// Mode number a freshly constructed [`BaseLexer`] starts in.
pub const DEFAULT_MODE: i32 = 0;
/// Newtype wrapper around a raw `i32` lexer mode number.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerMode(pub i32);
/// Identifies a custom lexer action by rule index, action index, and position.
///
/// NOTE(review): `position` is presumably the input offset at which the action
/// applies — this type only stores and returns it; confirm against the caller.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerCustomAction {
    rule_index: i32,
    action_index: i32,
    position: usize,
}

impl LexerCustomAction {
    /// Bundles the three identifying values into a `LexerCustomAction`.
    pub const fn new(rule_index: i32, action_index: i32, position: usize) -> Self {
        LexerCustomAction {
            position,
            action_index,
            rule_index,
        }
    }

    /// Rule index supplied at construction.
    pub const fn rule_index(self) -> i32 {
        let Self { rule_index, .. } = self;
        rule_index
    }

    /// Action index supplied at construction.
    pub const fn action_index(self) -> i32 {
        let Self { action_index, .. } = self;
        action_index
    }

    /// Position supplied at construction.
    pub const fn position(self) -> usize {
        let Self { position, .. } = self;
        position
    }
}
/// Identifies a lexer predicate by rule index, predicate index, and position.
///
/// NOTE(review): `position` is presumably the input offset at which the predicate
/// is evaluated — this type only stores and returns it; confirm against the caller.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerPredicate {
    rule_index: usize,
    pred_index: usize,
    position: usize,
}

impl LexerPredicate {
    /// Bundles the three identifying values into a `LexerPredicate`.
    pub const fn new(rule_index: usize, pred_index: usize, position: usize) -> Self {
        LexerPredicate {
            position,
            pred_index,
            rule_index,
        }
    }

    /// Rule index supplied at construction.
    pub const fn rule_index(self) -> usize {
        let Self { rule_index, .. } = self;
        rule_index
    }

    /// Predicate index supplied at construction.
    pub const fn pred_index(self) -> usize {
        let Self { pred_index, .. } = self;
        pred_index
    }

    /// Position supplied at construction.
    pub const fn position(self) -> usize {
        let Self { position, .. } = self;
        position
    }
}
/// Mode-management operations a lexer provides on top of [`Recognizer`].
///
/// The per-method notes describe what the `BaseLexer` implementation in this
/// file does; other implementors are not visible from here.
pub trait Lexer: Recognizer {
/// Returns the current mode number.
fn mode(&self) -> i32;
/// Makes `mode` the current mode (`BaseLexer` leaves its mode stack untouched).
fn set_mode(&mut self, mode: i32);
/// Enters `mode`; `BaseLexer` first saves the current mode on its mode stack.
fn push_mode(&mut self, mode: i32);
/// Returns to the most recently saved mode and reports it; `BaseLexer` returns
/// `None` (and keeps the current mode) when no mode has been pushed.
fn pop_mode(&mut self) -> Option<i32>;
}
/// Lexer state shared by generated lexers: the input stream, token bookkeeping,
/// mode stack, collected errors, and a trace of the lexer DFA.
///
/// `I` is the input stream type and `F` the token factory (defaults to
/// `CommonTokenFactory`).
#[derive(Clone, Debug)]
pub struct BaseLexer<I, F = CommonTokenFactory> {
/// Input stream being read.
input: I,
/// Recognizer data handed back by the `Recognizer` impl.
data: RecognizerData,
/// Factory that `emit_with_stop` uses to build tokens.
factory: F,
/// Current mode number.
mode: i32,
/// Modes saved by `push_mode`, most recent last.
mode_stack: Vec<i32>,
/// Input index captured by the last `begin_token`.
token_start: usize,
/// Line captured by the last `begin_token`.
token_start_line: usize,
/// Column captured by the last `begin_token`.
token_start_column: usize,
/// Current line; starts at 1 and grows by one per consumed `'\n'`.
line: usize,
/// Current column; starts at 0 and resets to 0 after a consumed `'\n'`.
column: usize,
/// Flag stored by `set_hit_eof`; nothing in this part of the file sets it automatically.
hit_eof: bool,
/// Errors added by `record_error` and handed out by `drain_errors`.
errors: Vec<TokenSourceError>,
/// Recorded DFA states and edges, rendered by `lexer_dfa_string`.
lexer_dfa: LexerDfaTrace,
}
/// Record of the lexer DFA as it is observed: numbered states, which of them
/// are accept states, and the edges between them.
#[derive(Clone, Debug, Default)]
struct LexerDfaTrace {
/// State number assigned to each distinct state key.
state_numbers: BTreeMap<LexerDfaKey, usize>,
/// Prediction recorded for each state number that was registered as accepting.
accept_predictions: BTreeMap<usize, i32>,
/// Recorded edges; the set keeps them deduplicated and sorted.
edges: BTreeSet<LexerDfaEdge>,
}
impl LexerDfaTrace {
/// Creates an empty trace. Spelled out with the `const` collection constructors
/// so it can be called from `BaseLexer::with_factory`, which is a `const fn`.
const fn new() -> Self {
Self {
state_numbers: BTreeMap::new(),
accept_predictions: BTreeMap::new(),
edges: BTreeSet::new(),
}
}
}
/// Identity of a lexer DFA state: its configurations held in sorted order.
///
/// Because the list is sorted on construction, two keys built from the same
/// configurations in different orders compare equal. Duplicates are kept.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub(crate) struct LexerDfaKey {
    configs: Vec<LexerDfaConfigKey>,
}

impl LexerDfaKey {
    /// Builds a key from `configs`, normalising their order by sorting.
    pub(crate) fn new(configs: Vec<LexerDfaConfigKey>) -> Self {
        let mut sorted = configs;
        // Equal elements are indistinguishable, so an unstable sort is sufficient.
        sorted.sort_unstable();
        Self { configs: sorted }
    }
}
/// Comparable summary of one configuration inside a lexer DFA state.
///
/// The derived ordering compares fields in declaration order, so the field
/// order below is significant and must not be rearranged.
///
/// NOTE(review): the field meanings are presumed from their names (ATN state
/// number, rule index of the alternative, call stack, pending actions) — this
/// type only stores them; confirm against the code that builds these keys.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub(crate) struct LexerDfaConfigKey {
    state: usize,
    alt_rule_index: Option<usize>,
    consumed_eof: bool,
    passed_non_greedy: bool,
    stack: Vec<usize>,
    actions: Vec<usize>,
}

impl LexerDfaConfigKey {
    /// Stores the given values unchanged in a new key.
    pub(crate) const fn new(
        state: usize,
        alt_rule_index: Option<usize>,
        consumed_eof: bool,
        passed_non_greedy: bool,
        stack: Vec<usize>,
        actions: Vec<usize>,
    ) -> Self {
        LexerDfaConfigKey {
            actions,
            stack,
            passed_non_greedy,
            consumed_eof,
            alt_rule_index,
            state,
        }
    }
}
/// One recorded DFA transition.
///
/// The derived ordering compares `from`, then `symbol`, then `to`; since edges
/// are kept in a `BTreeSet`, that is also the order `lexer_dfa_string` prints them in.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct LexerDfaEdge {
/// Number of the state the edge leaves.
from: usize,
/// Input symbol labelling the edge.
symbol: i32,
/// Number of the state the edge enters.
to: usize,
}
impl<I> BaseLexer<I>
where
I: CharStream,
{
/// Creates a lexer over `input` that uses the default `CommonTokenFactory`.
///
/// Equivalent to calling `with_factory` with `CommonTokenFactory`.
pub const fn new(input: I, data: RecognizerData) -> Self {
Self::with_factory(input, data, CommonTokenFactory)
}
}
impl<I, F> BaseLexer<I, F>
where
I: CharStream,
F: TokenFactory,
{
/// Creates a lexer over `input` that builds its tokens with `factory`.
///
/// The lexer starts in `DEFAULT_MODE` with an empty mode stack, positioned at
/// line 1, column 0, with no recorded errors and an empty DFA trace.
pub const fn with_factory(input: I, data: RecognizerData, factory: F) -> Self {
Self {
input,
data,
factory,
mode: DEFAULT_MODE,
mode_stack: Vec::new(),
token_start: 0,
token_start_line: 1,
token_start_column: 0,
line: 1,
column: 0,
hit_eof: false,
errors: Vec::new(),
lexer_dfa: LexerDfaTrace::new(),
}
}
/// Shared access to the underlying input stream.
pub const fn input(&self) -> &I {
&self.input
}
/// Exclusive access to the underlying input stream.
///
/// Moving the stream through this reference bypasses the line/column tracking
/// that `consume_char` performs.
pub const fn input_mut(&mut self) -> &mut I {
&mut self.input
}
/// Marks the current position as the start of the next token by snapshotting
/// the input index together with the current line and column.
pub fn begin_token(&mut self) {
self.token_start = self.input.index();
self.token_start_line = self.line;
self.token_start_column = self.column;
}
/// Input index captured by the most recent `begin_token` (0 before the first call).
pub const fn token_start(&self) -> usize {
self.token_start
}
/// Line captured by the most recent `begin_token` (1 before the first call).
pub const fn token_start_line(&self) -> usize {
self.token_start_line
}
/// Column captured by the most recent `begin_token` (0 before the first call).
pub const fn token_start_column(&self) -> usize {
self.token_start_column
}
/// Consumes one symbol from the input and keeps the line/column counters in step.
///
/// Does nothing when the next symbol is `EOF`. A consumed line feed advances
/// `line` and resets `column` to 0; any other symbol advances `column` by one.
pub fn consume_char(&mut self) {
    let symbol = self.input.la(1);
    if symbol != EOF {
        self.input.consume();
        // Only code point 10 decodes to '\n', so a numeric comparison is enough.
        if symbol == i32::from(b'\n') {
            self.column = 0;
            self.line += 1;
        } else {
            self.column += 1;
        }
    }
}
/// Repositions the lexer at `index` while recomputing line and column.
///
/// The input is rewound to the token start and the line/column counters are
/// restored to their token-start values; symbols are then re-consumed one at a
/// time so the counters end up correct for the new position. An `index` before
/// the token start is treated as the token start, and replay stops early at `EOF`.
pub fn reset_accept_position(&mut self, index: usize) {
    // Rewind to the state captured by `begin_token`.
    self.input.seek(self.token_start);
    self.line = self.token_start_line;
    self.column = self.token_start_column;

    let destination = if index < self.token_start {
        self.token_start
    } else {
        index
    };

    // Replay forward through `consume_char` so newlines are accounted for.
    loop {
        if self.input.index() >= destination || self.input.la(1) == EOF {
            break;
        }
        self.consume_char();
    }
}
/// Builds a token that ends at the last consumed symbol (input index minus one).
///
/// When the input index is 0 the stop index becomes `usize::MAX`, the marker
/// `emit_with_stop` treats as "nothing consumed". `text`, when provided,
/// overrides the text taken from the input.
pub fn emit(&self, token_type: i32, channel: i32, text: Option<String>) -> CommonToken {
    // Index 0 wraps around to `usize::MAX`.
    let last_consumed = self.input.index().wrapping_sub(1);
    self.emit_with_stop(token_type, channel, last_consumed, text)
}
/// Builds a token spanning from the token start to `stop` (inclusive).
///
/// The token's text is, in order of preference: the supplied `text`; the
/// literal `"<EOF>"` when `stop` is `usize::MAX`; otherwise the input text
/// between the token start and `stop`. Line and column are those captured by
/// `begin_token`. The token itself is created by the configured factory.
pub fn emit_with_stop(
    &self,
    token_type: i32,
    channel: i32,
    stop: usize,
    text: Option<String>,
) -> CommonToken {
    let resolved_text = match text {
        Some(explicit) => explicit,
        None if stop == usize::MAX => "<EOF>".to_owned(),
        None => self.input.text(TextInterval::new(self.token_start, stop)),
    };
    let spec = TokenSpec {
        token_type,
        channel,
        start: self.token_start,
        stop,
        line: self.token_start_line,
        column: self.token_start_column,
        text: Some(resolved_text),
        source_name: self.input.source_name(),
    };
    self.factory.create(spec)
}
/// Text of the token in progress: from the token start up to, but not
/// including, the current input index.
pub fn token_text(&self) -> String {
self.token_text_until(self.input.index())
}
/// Input text from the token start up to, but not including, `stop_exclusive`.
///
/// Returns an empty string when `stop_exclusive` is at or before the token start.
pub fn token_text_until(&self, stop_exclusive: usize) -> String {
    match stop_exclusive.checked_sub(1) {
        // `last` is the inclusive end; it only forms a non-empty range when it
        // does not precede the token start.
        Some(last) if last >= self.token_start => self
            .input
            .text(TextInterval::new(self.token_start, last)),
        _ => String::new(),
    }
}
/// Column the lexer would be at after reading from the token start up to, but
/// not including, `position`.
///
/// Starts from the token-start column and walks the intervening text: a line
/// feed resets the column to 0, any other character adds one. A `position` at
/// or before the token start yields the token-start column.
pub fn column_at(&self, position: usize) -> usize {
    if position <= self.token_start {
        return self.token_start_column;
    }
    let scanned = self
        .input
        .text(TextInterval::new(self.token_start, position - 1));
    scanned
        .chars()
        .fold(self.token_start_column, |column, ch| match ch {
            '\n' => 0,
            _ => column + 1,
        })
}
/// Builds an end-of-file token with `CommonToken::eof`, passing the input's
/// source name, the current input index, and the current line and column.
pub fn eof_token(&self) -> CommonToken {
CommonToken::eof(
self.input.source_name(),
self.input.index(),
self.line,
self.column,
)
}
}
impl<I, F> Recognizer for BaseLexer<I, F>
where
I: CharStream,
F: TokenFactory,
{
/// Shared access to the recognizer data supplied at construction.
fn data(&self) -> &RecognizerData {
&self.data
}
/// Exclusive access to the recognizer data supplied at construction.
fn data_mut(&mut self) -> &mut RecognizerData {
&mut self.data
}
}
impl<I, F> Lexer for BaseLexer<I, F>
where
    I: CharStream,
    F: TokenFactory,
{
    /// Returns the current mode number.
    fn mode(&self) -> i32 {
        self.mode
    }

    /// Switches to `mode` without touching the mode stack.
    fn set_mode(&mut self, mode: i32) {
        self.mode = mode;
    }

    /// Switches to `mode`, saving the mode that was current on the mode stack.
    fn push_mode(&mut self, mode: i32) {
        let suspended = std::mem::replace(&mut self.mode, mode);
        self.mode_stack.push(suspended);
    }

    /// Restores the most recently saved mode and returns it.
    ///
    /// Returns `None` and leaves the current mode unchanged when the mode stack is empty.
    fn pop_mode(&mut self) -> Option<i32> {
        self.mode_stack.pop().map(|restored| {
            self.mode = restored;
            restored
        })
    }
}
impl<I, F> BaseLexer<I, F>
where
I: CharStream,
F: TokenFactory,
{
/// Current line; starts at 1 and grows by one for each line feed consumed via `consume_char`.
pub const fn line(&self) -> usize {
self.line
}
/// Current column; starts at 0 and is reset to 0 after each line feed consumed via `consume_char`.
pub const fn column(&self) -> usize {
self.column
}
/// Source name as reported by the underlying input stream.
pub fn source_name(&self) -> &str {
self.input.source_name()
}
/// Value last stored with `set_hit_eof` (`false` on a new lexer).
pub const fn hit_eof(&self) -> bool {
self.hit_eof
}
/// Stores the end-of-input flag returned by `hit_eof`.
pub const fn set_hit_eof(&mut self, hit_eof: bool) {
self.hit_eof = hit_eof;
}
/// Appends an error with the given location and message to the pending error list.
pub fn record_error(&mut self, line: usize, column: usize, message: impl Into<String>) {
self.errors
.push(TokenSourceError::new(line, column, message));
}
/// Returns every error recorded so far, in recording order, and leaves the
/// pending error list empty.
pub fn drain_errors(&mut self) -> Vec<TokenSourceError> {
std::mem::take(&mut self.errors)
}
/// Returns the state number for `key`, assigning the next free number the
/// first time a key is seen.
///
/// Numbers are handed out sequentially starting at 0. When `accept_prediction`
/// is `Some`, it is recorded as the state's prediction, replacing any earlier
/// one; `None` leaves an existing prediction in place.
pub(crate) fn lexer_dfa_state(
    &mut self,
    key: LexerDfaKey,
    accept_prediction: Option<i32>,
) -> usize {
    let trace = &mut self.lexer_dfa;
    // The map's current size is the number a previously unseen key receives.
    let fresh_number = trace.state_numbers.len();
    let number = *trace.state_numbers.entry(key).or_insert(fresh_number);
    if let Some(prediction) = accept_prediction {
        trace.accept_predictions.insert(number, prediction);
    }
    number
}
/// Records a DFA transition from state `from` to state `to` on `symbol`.
///
/// Edges live in a set, so recording the same transition again has no effect.
pub fn record_lexer_dfa_edge(&mut self, from: usize, symbol: i32, to: usize) {
self.lexer_dfa
.edges
.insert(LexerDfaEdge { from, symbol, to });
}
pub fn lexer_dfa_string(&self) -> String {
let mut out = String::new();
for edge in &self.lexer_dfa.edges {
let Some(label) = lexer_dfa_edge_label(edge.symbol) else {
continue;
};
out.push_str(&self.lexer_dfa_state_string(edge.from));
out.push('-');
out.push_str(&label);
out.push_str("->");
out.push_str(&self.lexer_dfa_state_string(edge.to));
out.push('\n');
}
out
}
/// Formats a state number for the DFA dump: `s<N>` for an ordinary state and
/// `:s<N>=><prediction>` for a state with a recorded accept prediction.
fn lexer_dfa_state_string(&self, state: usize) -> String {
    match self.lexer_dfa.accept_predictions.get(&state) {
        Some(prediction) => format!(":s{state}=>{prediction}"),
        None => format!("s{state}"),
    }
}
}
/// Label for a DFA edge: the symbol's character wrapped in single quotes.
///
/// Returns `None` when `symbol` is not a valid Unicode scalar value — negative
/// values (reinterpreted as large unsigned numbers), surrogates, and anything
/// above `0x10FFFF`.
fn lexer_dfa_edge_label(symbol: i32) -> Option<String> {
    let ch = char::from_u32(symbol.cast_unsigned())?;
    Some(format!("'{ch}'"))
}