llguidance/earley/
lexer.rs

1use anyhow::Result;
2use std::fmt::Debug;
3use toktrie::SimpleVob;
4
5use crate::api::ParserLimits;
6
7use super::{
8    lexerspec::{Lexeme, LexemeIdx, LexerSpec},
9    regexvec::{LexemeSet, MatchingLexemes, NextByte, RegexVec, StateDesc},
10};
11
// Compile-time switch for the `debug!` macro below; output is additionally
// gated on the "logging" cargo feature, so release builds without that
// feature emit nothing.
const DEBUG: bool = true;

// Print to stderr only when both the "logging" feature and DEBUG are on.
// `cfg!` is evaluated at compile time, so the whole branch is optimized
// away when logging is disabled.
macro_rules! debug {
    ($($arg:tt)*) => {
        if cfg!(feature = "logging") && DEBUG {
            eprintln!($($arg)*);
        }
    }
}
21
/// Byte-level lexer that drives a DFA (`RegexVec`) compiled from the
/// lexemes declared in a `LexerSpec`.
#[derive(Clone)]
pub struct Lexer {
    pub(crate) dfa: RegexVec,
    // set of bytes that are allowed as the *first* byte of at least one
    // lexeme (computed by probing all transitions from the initial state);
    // this is used to fail dead states quickly in `advance()`
    allowed_first_byte: SimpleVob,
    // the spec this lexer was built from; cloned on construction
    spec: LexerSpec,
}
30
/// DFA state handle, re-exported from the `derivre` regex engine.
pub type StateID = derivre::StateID;
32
/// PreLexeme contains index of the lexeme but not the bytes.
/// The caller is responsible for tracking the actual byte content.
#[derive(Debug, Clone, Copy)]
pub struct PreLexeme {
    // Which lexeme(s) matched; resolved via `Lexer::lexemes_from_idx()`.
    pub idx: MatchingLexemesIdx,
    // The byte that triggered completion of this lexeme, if any.
    pub byte: Option<u8>,
    /// Does the 'byte' above belong to the next lexeme?
    pub byte_next_row: bool,
    // Length in bytes of the hidden part of the lexeme.
    // pub hidden_len: u32,
}
43
44impl PreLexeme {
45    pub fn just_idx(idx: MatchingLexemesIdx) -> Self {
46        PreLexeme {
47            idx,
48            byte: None,
49            byte_next_row: false,
50        }
51    }
52}
53
/// Outcome of feeding one byte to the lexer (or forcing a lexeme end).
#[derive(Debug)]
pub enum LexerResult {
    /// A lexeme was completed; see `PreLexeme` for what is carried.
    Lexeme(PreLexeme),
    /// A lazy match was hit in a state flagged as carrying a special token
    /// (see `Lexer::advance`).
    SpecialToken(StateID),
    /// No lexeme finished yet; lexing continues from this state,
    /// the `u8` being the byte just consumed.
    State(StateID, u8),
    /// The byte can neither extend the current lexeme nor start a new one.
    Error,
}
61
impl Lexer {
    /// Build a lexer from `spec`, compiling its lexemes into a `RegexVec` DFA.
    /// Also precomputes which bytes can start any lexeme at all, so that
    /// `advance()` can fail dead states quickly. `dbg` enables extra
    /// construction-time logging.
    pub fn from(spec: &LexerSpec, limits: &mut ParserLimits, dbg: bool) -> Result<Self> {
        let mut dfa = spec.to_regex_vec(limits)?;

        if dbg {
            debug!("lexer: {:?}\n  ==> dfa: {:?}", spec, dfa);
        }

        // Probe every byte from the initial state (with all lexemes allowed);
        // bytes whose transition is dead can never begin a lexeme.
        let s0 = dfa.initial_state(&spec.all_lexemes());
        let mut allowed_first_byte = SimpleVob::alloc(256);
        for i in 0..=255 {
            if !dfa.transition(s0, i).is_dead() {
                allowed_first_byte.allow_token(i as u32);
            }
        }

        let lex = Lexer {
            dfa,
            allowed_first_byte,
            spec: spec.clone(), // TODO check perf of Rc<> ?
        };

        Ok(lex)
    }

    /// The spec this lexer was built from.
    pub fn lexer_spec(&self) -> &LexerSpec {
        &self.spec
    }

    /// Initial DFA state restricted to `allowed_lexemes`.
    pub fn start_state(&mut self, allowed_lexemes: &LexemeSet) -> StateID {
        self.dfa.initial_state(allowed_lexemes)
    }

    /// Advance `s` by `first_byte` if one is given; otherwise return `s`
    /// unchanged.
    pub fn transition_start_state(&mut self, s: StateID, first_byte: Option<u8>) -> StateID {
        first_byte.map(|b| self.dfa.transition(s, b)).unwrap_or(s)
    }

    /// The canonical dead state (no lexeme can match from it).
    pub fn a_dead_state(&self) -> StateID {
        StateID::DEAD
    }

    /// Maximum lookahead length possible from `state`, as reported by the
    /// DFA; used to bound the "hidden" part of a lexeme.
    pub fn possible_hidden_len(&mut self, state: StateID) -> usize {
        self.dfa.possible_lookahead_len(state)
    }

    /// Cached per-state descriptor (accepting sets, possible lexemes, etc.).
    fn state_info(&self, state: StateID) -> &StateDesc {
        self.dfa.state_desc(state)
    }

    /// True if any greedy-accepting lexeme of `state` is allowed to end
    /// at end-of-sequence per the spec.
    pub fn allows_eos(&mut self, state: StateID) -> bool {
        let l = self.spec.eos_ending_lexemes();
        for lexeme in self.state_info(state).greedy_accepting.as_slice() {
            if l.contains(*lexeme) {
                return true;
            }
        }
        false
    }

    /// Restrict `state` so only `allowed_lexemes` can still match.
    pub fn limit_state_to(&mut self, state: StateID, allowed_lexemes: &LexemeSet) -> StateID {
        self.dfa.limit_state_to(state, allowed_lexemes)
    }

    /// Lexemes that can still match from `state`.
    pub fn possible_lexemes(&self, state: StateID) -> &LexemeSet {
        &self.state_info(state).possible
    }

    /// Force a lexeme boundary at `prev`, picking the first still-possible
    /// lexeme regardless of whether the state is accepting.
    pub fn force_lexeme_end(&self, prev: StateID) -> LexerResult {
        let info = self.state_info(prev);
        match info.possible.first() {
            Some(idx) => LexerResult::Lexeme(PreLexeme::just_idx(MatchingLexemesIdx::Single(idx))),
            None => LexerResult::Error,
        }
    }

    /// End the current lexeme at `prev` if that state is greedy-accepting;
    /// otherwise report an error.
    pub fn try_lexeme_end(&mut self, prev: StateID) -> LexerResult {
        if self.state_info(prev).greedy_accepting.is_some() {
            LexerResult::Lexeme(PreLexeme::just_idx(MatchingLexemesIdx::GreedyAccepting(
                prev,
            )))
        } else {
            LexerResult::Error
        }
    }

    /// If the only thing the DFA can do from `state` is end the input
    /// (ForcedEOI), emit a lexeme completed by byte `b`.
    pub fn check_for_single_byte_lexeme(&mut self, state: StateID, b: u8) -> Option<PreLexeme> {
        if self.dfa.next_byte(state) == NextByte::ForcedEOI {
            Some(PreLexeme {
                idx: MatchingLexemesIdx::GreedyAccepting(state),
                byte: Some(b),
                byte_next_row: false,
            })
        } else {
            None
        }
    }

    /// Whether subsumption checking is applicable in `state`
    /// (delegated to the DFA).
    pub fn subsume_possible(&mut self, state: StateID) -> bool {
        self.dfa.subsume_possible(state)
    }

    /// Check whether the extra lexeme at `extra_idx` is subsumed in `state`,
    /// spending at most `budget` units of DFA work.
    pub fn check_subsume(&mut self, state: StateID, extra_idx: usize, budget: u64) -> Result<bool> {
        self.dfa
            .check_subsume(state, self.spec.extra_lexeme(extra_idx), budget)
    }

    /// What byte (if any) is forced next from `state`; fuzzed when the state
    /// is greedy-accepting, since ending the lexeme is then also an option.
    pub fn next_byte(&mut self, state: StateID) -> NextByte {
        // there should be no transition from a state with a lazy match
        // - it should have generated a lexeme
        assert!(!state.has_lowest_match());

        let mut forced = self.dfa.next_byte(state);

        let info = self.dfa.state_desc(state);
        if info.greedy_accepting.is_some() {
            // with a greedy-accepting match present, any transition to DEAD
            // state (of which there are likely many) would generate a lexeme
            forced = forced.make_fuzzy();
        }

        forced
    }

    /// Feed one byte. Returns:
    /// - `State` when lexing simply continues,
    /// - `Lexeme` when the transition dies but `prev` was greedy-accepting
    ///   (the byte then belongs to the next row) or a lazy match fires,
    /// - `SpecialToken` when a lazy match fires in a state flagged as
    ///   carrying a special token,
    /// - `Error` when the byte can neither extend nor start a lexeme.
    #[inline(always)]
    pub fn advance(&mut self, prev: StateID, byte: u8, enable_logging: bool) -> LexerResult {
        let state = self.dfa.transition(prev, byte);

        if enable_logging {
            let info = self.state_info(state);
            debug!(
                "lex: {:?} -{:?}-> {:?}, acpt={:?}",
                prev, byte as char, state, info.greedy_accepting
            );
        }

        if state.is_dead() {
            // if the left-over byte is not allowed as the first byte of any lexeme, we can fail early
            if !self.allowed_first_byte.is_allowed(byte as u32) {
                return LexerResult::Error;
            }
            let info = self.dfa.state_desc(prev);
            // we take the first token that matched
            // (eg., "while" will match both keyword and identifier, but keyword is first)
            if info.greedy_accepting.is_some() {
                LexerResult::Lexeme(PreLexeme {
                    idx: MatchingLexemesIdx::GreedyAccepting(prev),
                    byte: Some(byte),
                    byte_next_row: true,
                })
            } else {
                LexerResult::Error
            }
        } else if state.has_lowest_match() {
            // a lazy (lowest-match) lexeme completed exactly at this byte
            let info = self.dfa.state_desc(state);
            assert!(info.lazy_accepting.is_some());
            if info.has_special_token {
                return LexerResult::SpecialToken(state);
            }
            LexerResult::Lexeme(PreLexeme {
                idx: MatchingLexemesIdx::LazyAccepting(state),
                byte: Some(byte),
                byte_next_row: false,
            })
        } else {
            LexerResult::State(state, byte)
        }
    }

    /// Resolve a `MatchingLexemesIdx` to the concrete set of lexemes,
    /// either from the spec (Single) or from the DFA state descriptor.
    pub fn lexemes_from_idx(&self, idx: MatchingLexemesIdx) -> &MatchingLexemes {
        match idx {
            MatchingLexemesIdx::Single(idx) => &self.spec.lexeme_spec(idx).single_set,
            MatchingLexemesIdx::GreedyAccepting(state_id) => {
                &self.dfa.state_desc(state_id).greedy_accepting
            }
            MatchingLexemesIdx::LazyAccepting(state_id) => {
                &self.dfa.state_desc(state_id).lazy_accepting
            }
        }
    }

    /// Returns `(hidden_len, is_suffix)` for a matched lexeme. Only lazy
    /// matches can have a non-zero hidden (lookahead) length; everything
    /// else is `(0, false)`.
    #[inline(always)]
    pub fn lexeme_props(&self, idx: MatchingLexemesIdx) -> (u32, bool) {
        match idx {
            MatchingLexemesIdx::Single(_) | MatchingLexemesIdx::GreedyAccepting(_) => (0, false),
            MatchingLexemesIdx::LazyAccepting(state_id) => {
                let info = self.dfa.state_desc(state_id);
                let hidden = info.lazy_hidden_len;
                if hidden > 0 {
                    // suffix-ness is taken from the first matching lexeme's spec
                    let spec = self.spec.lexeme_spec(info.lazy_accepting.first().unwrap());
                    (hidden, spec.is_suffix)
                } else {
                    (0, false)
                }
            }
        }
    }

    /// Human-readable rendering of a lexeme for debugging/logging.
    pub fn dbg_lexeme(&self, lex: &Lexeme) -> String {
        let set = self.lexemes_from_idx(lex.idx);

        // let info = &self.lexemes[lex.idx.as_usize()];
        // if matches!(info.rx, RegexAst::Literal(_)) && lex.hidden_len == 0 {
        //     format!("[{}]", info.name)
        // }

        format!("{:?} {:?}", lex, set)
    }
}
270
271impl LexerResult {
272    #[inline(always)]
273    pub fn is_error(&self) -> bool {
274        matches!(self, LexerResult::Error)
275    }
276}
277
/// Reference to the set of lexemes that matched; resolved lazily through
/// `Lexer::lexemes_from_idx()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MatchingLexemesIdx {
    /// A single, explicitly chosen lexeme.
    Single(LexemeIdx),
    /// The greedy-accepting set recorded on the given DFA state.
    GreedyAccepting(StateID),
    /// The lazy-accepting set recorded on the given DFA state.
    LazyAccepting(StateID),
}