1use anyhow::Result;
2use std::fmt::Debug;
3use toktrie::SimpleVob;
4
5use crate::api::ParserLimits;
6
7use super::{
8 lexerspec::{Lexeme, LexemeIdx, LexerSpec},
9 regexvec::{LexemeSet, MatchingLexemes, NextByte, RegexVec, StateDesc},
10};
11
/// Compile-time switch for the `debug!` macro below. Output additionally
/// requires the `logging` cargo feature, so this stays `true` in the source.
const DEBUG: bool = true;

/// Print lexer trace output to stderr, but only when built with the
/// `logging` feature *and* `DEBUG` is true. The condition is constant,
/// so the whole call compiles away otherwise.
macro_rules! debug {
    ($($arg:tt)*) => {
        if cfg!(feature = "logging") && DEBUG {
            eprintln!($($arg)*);
        }
    }
}
21
/// A lexer driven by a single DFA compiled from a set of lexemes.
#[derive(Clone)]
pub struct Lexer {
    /// DFA over all lexemes in `spec`; built lazily state-by-state
    /// (hence the `&mut self` on most transition-taking methods).
    pub(crate) dfa: RegexVec,
    /// Bitset over byte values 0..=255: bytes whose transition from the
    /// all-lexemes initial state is not dead, i.e. bytes that can start
    /// at least one lexeme. Precomputed in `Lexer::from`.
    allowed_first_byte: SimpleVob,
    /// The lexer specification this lexer was compiled from (cloned).
    spec: LexerSpec,
}

/// DFA state handle; re-exported from the `derivre` crate.
pub type StateID = derivre::StateID;
32
/// A lexeme match candidate emitted by the lexer before it is resolved
/// into a final `Lexeme` (hence "pre").
#[derive(Debug, Clone, Copy)]
pub struct PreLexeme {
    /// Which lexeme(s) matched, possibly indirectly via a DFA state's
    /// accepting set; see `MatchingLexemesIdx`.
    pub idx: MatchingLexemesIdx,
    /// The byte on which the match was detected, if any.
    pub byte: Option<u8>,
    /// When true, `byte` was not consumed by this lexeme and belongs to
    /// the start of the next one (see the dead-state path in `advance`).
    pub byte_next_row: bool,
}

impl PreLexeme {
    /// Build a `PreLexeme` that carries only a match index — no byte,
    /// and nothing carried over to the next row.
    pub fn just_idx(idx: MatchingLexemesIdx) -> Self {
        PreLexeme {
            idx,
            byte: None,
            byte_next_row: false,
        }
    }
}
53
/// Outcome of advancing the lexer by one byte (see `Lexer::advance`).
#[derive(Debug)]
pub enum LexerResult {
    /// A lexeme was completed.
    Lexeme(PreLexeme),
    /// A special token was recognized; the state carries the details.
    SpecialToken(StateID),
    /// No lexeme yet — lexing continues in this state after this byte.
    State(StateID, u8),
    /// The byte cannot contribute to any allowed lexeme.
    Error,
}
61
impl Lexer {
    /// Compile `spec` into a DFA-backed lexer.
    ///
    /// `limits` bounds regex compilation (and may be adjusted by it).
    /// When `dbg` is set, the spec and resulting DFA are dumped via
    /// `debug!` (visible only with the `logging` feature).
    pub fn from(spec: &LexerSpec, limits: &mut ParserLimits, dbg: bool) -> Result<Self> {
        let mut dfa = spec.to_regex_vec(limits)?;

        if dbg {
            debug!("lexer: {:?}\n ==> dfa: {:?}", spec, dfa);
        }

        // Precompute which bytes can begin *any* lexeme: a byte is allowed
        // iff it doesn't take the all-lexemes initial state to the dead
        // state. `advance` uses this to tell "byte is garbage everywhere"
        // apart from "byte just ends the current lexeme".
        let s0 = dfa.initial_state(&spec.all_lexemes());
        let mut allowed_first_byte = SimpleVob::alloc(256);
        for i in 0..=255 {
            if !dfa.transition(s0, i).is_dead() {
                allowed_first_byte.allow_token(i as u32);
            }
        }

        let lex = Lexer {
            dfa,
            allowed_first_byte,
            spec: spec.clone(),
        };

        Ok(lex)
    }

    /// The specification this lexer was compiled from.
    pub fn lexer_spec(&self) -> &LexerSpec {
        &self.spec
    }

    /// Initial DFA state restricted to `allowed_lexemes`.
    pub fn start_state(&mut self, allowed_lexemes: &LexemeSet) -> StateID {
        self.dfa.initial_state(allowed_lexemes)
    }

    /// Advance a start state over an optional first byte; with `None`,
    /// the state is returned unchanged.
    pub fn transition_start_state(&mut self, s: StateID, first_byte: Option<u8>) -> StateID {
        first_byte.map(|b| self.dfa.transition(s, b)).unwrap_or(s)
    }

    /// A canonical dead (sink) state.
    pub fn a_dead_state(&self) -> StateID {
        StateID::DEAD
    }

    /// Upper bound on the number of lookahead ("hidden") bytes reachable
    /// from `state`; delegates to the DFA.
    pub fn possible_hidden_len(&mut self, state: StateID) -> usize {
        self.dfa.possible_lookahead_len(state)
    }

    /// Cached descriptor (accepting sets, possible lexemes, …) for `state`.
    fn state_info(&self, state: StateID) -> &StateDesc {
        self.dfa.state_desc(state)
    }

    /// True if any greedy-accepting lexeme of `state` is one that may end
    /// at end-of-sequence (per `spec.eos_ending_lexemes()`).
    pub fn allows_eos(&mut self, state: StateID) -> bool {
        let l = self.spec.eos_ending_lexemes();
        for lexeme in self.state_info(state).greedy_accepting.as_slice() {
            if l.contains(*lexeme) {
                return true;
            }
        }
        false
    }

    /// Restrict `state` so only `allowed_lexemes` remain possible,
    /// returning the restricted state.
    pub fn limit_state_to(&mut self, state: StateID, allowed_lexemes: &LexemeSet) -> StateID {
        self.dfa.limit_state_to(state, allowed_lexemes)
    }

    /// The set of lexemes still possible (not yet ruled out) in `state`.
    pub fn possible_lexemes(&self, state: StateID) -> &LexemeSet {
        &self.state_info(state).possible
    }

    /// Forcibly end the current lexeme in `prev`, picking the first
    /// still-possible lexeme; `Error` if none remain.
    // NOTE(review): this picks `possible.first()` rather than an accepting
    // lexeme — presumably callers only use it where that is sound; confirm.
    pub fn force_lexeme_end(&self, prev: StateID) -> LexerResult {
        let info = self.state_info(prev);
        match info.possible.first() {
            Some(idx) => LexerResult::Lexeme(PreLexeme::just_idx(MatchingLexemesIdx::Single(idx))),
            None => LexerResult::Error,
        }
    }

    /// End the lexeme in `prev` if that state is greedy-accepting;
    /// otherwise `Error`.
    pub fn try_lexeme_end(&mut self, prev: StateID) -> LexerResult {
        if self.state_info(prev).greedy_accepting.is_some() {
            LexerResult::Lexeme(PreLexeme::just_idx(MatchingLexemesIdx::GreedyAccepting(
                prev,
            )))
        } else {
            LexerResult::Error
        }
    }

    /// If the only continuation from `state` is forced end-of-input, then
    /// byte `b` completed a lexeme by itself — return it; else `None`.
    pub fn check_for_single_byte_lexeme(&mut self, state: StateID, b: u8) -> Option<PreLexeme> {
        if self.dfa.next_byte(state) == NextByte::ForcedEOI {
            Some(PreLexeme {
                idx: MatchingLexemesIdx::GreedyAccepting(state),
                byte: Some(b),
                byte_next_row: false,
            })
        } else {
            None
        }
    }

    /// Whether subsumption checking is possible from `state`;
    /// delegates to the DFA (semantics defined in `RegexVec`).
    pub fn subsume_possible(&mut self, state: StateID) -> bool {
        self.dfa.subsume_possible(state)
    }

    /// Check whether the extra lexeme at `extra_idx` is subsumed in
    /// `state`, spending at most `budget`; delegates to the DFA.
    pub fn check_subsume(&mut self, state: StateID, extra_idx: usize, budget: u64) -> Result<bool> {
        self.dfa
            .check_subsume(state, self.spec.extra_lexeme(extra_idx), budget)
    }

    /// The forced next byte (if any) from `state`.
    ///
    /// Caller invariant: must not be called on a state with a pending
    /// lowest (lazy) match — those are handled in `advance`.
    pub fn next_byte(&mut self, state: StateID) -> NextByte {
        assert!(!state.has_lowest_match());

        let mut forced = self.dfa.next_byte(state);

        let info = self.dfa.state_desc(state);
        if info.greedy_accepting.is_some() {
            // The lexeme may also legally end right here, so the forced
            // byte is only advisory — downgrade it to "fuzzy".
            forced = forced.make_fuzzy();
        }

        forced
    }

    /// Advance the lexer from `prev` over `byte`.
    ///
    /// Returns:
    /// - `Lexeme` when a lexeme completes — either a greedy match closed
    ///   by hitting the dead state (with `byte` carried to the next row),
    ///   or a lazy "lowest" match on the new state;
    /// - `SpecialToken` when the lazy match carries a special token;
    /// - `State` when lexing simply continues;
    /// - `Error` when no lexeme can be formed.
    #[inline(always)]
    pub fn advance(&mut self, prev: StateID, byte: u8, enable_logging: bool) -> LexerResult {
        let state = self.dfa.transition(prev, byte);

        if enable_logging {
            let info = self.state_info(state);
            debug!(
                "lex: {:?} -{:?}-> {:?}, acpt={:?}",
                prev, byte as char, state, info.greedy_accepting
            );
        }

        if state.is_dead() {
            // `byte` cannot extend the current lexeme. If it also cannot
            // start any lexeme at all, fail fast.
            if !self.allowed_first_byte.is_allowed(byte as u32) {
                return LexerResult::Error;
            }
            // Otherwise close the lexeme at `prev` (if it was accepting)
            // and hand `byte` to the next row (`byte_next_row: true`).
            let info = self.dfa.state_desc(prev);
            if info.greedy_accepting.is_some() {
                LexerResult::Lexeme(PreLexeme {
                    idx: MatchingLexemesIdx::GreedyAccepting(prev),
                    byte: Some(byte),
                    byte_next_row: true,
                })
            } else {
                LexerResult::Error
            }
        } else if state.has_lowest_match() {
            // Lazy (shortest-match) lexeme triggered on the new state.
            let info = self.dfa.state_desc(state);
            assert!(info.lazy_accepting.is_some());
            if info.has_special_token {
                return LexerResult::SpecialToken(state);
            }
            LexerResult::Lexeme(PreLexeme {
                idx: MatchingLexemesIdx::LazyAccepting(state),
                byte: Some(byte),
                byte_next_row: false,
            })
        } else {
            LexerResult::State(state, byte)
        }
    }

    /// Resolve a `MatchingLexemesIdx` to the concrete set of matching
    /// lexemes, looking through the DFA state descriptors when needed.
    pub fn lexemes_from_idx(&self, idx: MatchingLexemesIdx) -> &MatchingLexemes {
        match idx {
            MatchingLexemesIdx::Single(idx) => &self.spec.lexeme_spec(idx).single_set,
            MatchingLexemesIdx::GreedyAccepting(state_id) => {
                &self.dfa.state_desc(state_id).greedy_accepting
            }
            MatchingLexemesIdx::LazyAccepting(state_id) => {
                &self.dfa.state_desc(state_id).lazy_accepting
            }
        }
    }

    /// Returns `(hidden_len, is_suffix)` for the match.
    ///
    /// Only lazy matches can have hidden (lookahead) bytes; for all other
    /// match kinds, and for lazy matches with no hidden bytes, `(0, false)`.
    #[inline(always)]
    pub fn lexeme_props(&self, idx: MatchingLexemesIdx) -> (u32, bool) {
        match idx {
            MatchingLexemesIdx::Single(_) | MatchingLexemesIdx::GreedyAccepting(_) => (0, false),
            MatchingLexemesIdx::LazyAccepting(state_id) => {
                let info = self.dfa.state_desc(state_id);
                let hidden = info.lazy_hidden_len;
                if hidden > 0 {
                    // `is_suffix` is read off the first accepting lexeme's spec.
                    let spec = self.spec.lexeme_spec(info.lazy_accepting.first().unwrap());
                    (hidden, spec.is_suffix)
                } else {
                    (0, false)
                }
            }
        }
    }

    /// Debug-format a lexeme together with its resolved matching set.
    pub fn dbg_lexeme(&self, lex: &Lexeme) -> String {
        let set = self.lexemes_from_idx(lex.idx);

        format!("{:?} {:?}", lex, set)
    }
}
270
271impl LexerResult {
272 #[inline(always)]
273 pub fn is_error(&self) -> bool {
274 matches!(self, LexerResult::Error)
275 }
276}
277
/// Identifies which lexeme(s) a `PreLexeme` matched — either directly,
/// or indirectly via the accepting set of a DFA state.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MatchingLexemesIdx {
    /// A single, directly-known lexeme.
    Single(LexemeIdx),
    /// The greedy-accepting set of the given DFA state.
    GreedyAccepting(StateID),
    /// The lazy-accepting set of the given DFA state.
    LazyAccepting(StateID),
}