1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::char_stream::{CharStream, TextInterval};
4use crate::int_stream::EOF;
5use crate::recognizer::{Recognizer, RecognizerData};
6use crate::token::{CommonToken, CommonTokenFactory, TokenFactory, TokenSourceError, TokenSpec};
7
/// Sentinel `-3`.
///
/// NOTE(review): presumably the token type a rule reports when its match should
/// be discarded; no use is visible in this file — confirm against the lexer driver.
pub const SKIP: i32 = -3;
/// Sentinel `-2`.
///
/// NOTE(review): presumably the token type a rule reports when matching should
/// continue into the same token; no use is visible in this file — confirm.
pub const MORE: i32 = -2;
/// Mode number a freshly constructed [`BaseLexer`] starts in.
pub const DEFAULT_MODE: i32 = 0;
11
/// Newtype wrapper around a raw `i32` lexer mode number.
///
/// NOTE(review): nothing in this file constructs or consumes this type; the
/// [`Lexer`] trait below works with plain `i32` modes.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerMode(pub i32);
14
/// Plain value describing one custom lexer action: the rule it belongs to, the
/// action's index, and a position.
///
/// NOTE(review): `position` is presumably an input offset at which the action
/// applies — confirm with the code that builds these values.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerCustomAction {
    rule_index: i32,
    action_index: i32,
    position: usize,
}

impl LexerCustomAction {
    /// Packs the three components into a value; no validation is performed.
    pub const fn new(rule_index: i32, action_index: i32, position: usize) -> Self {
        Self {
            rule_index,
            action_index,
            position,
        }
    }

    /// Rule index supplied to [`LexerCustomAction::new`].
    pub const fn rule_index(self) -> i32 {
        let Self { rule_index, .. } = self;
        rule_index
    }

    /// Action index supplied to [`LexerCustomAction::new`].
    pub const fn action_index(self) -> i32 {
        let Self { action_index, .. } = self;
        action_index
    }

    /// Position supplied to [`LexerCustomAction::new`].
    pub const fn position(self) -> usize {
        let Self { position, .. } = self;
        position
    }
}
53
/// Plain value describing one lexer predicate: the rule it belongs to, the
/// predicate's index, and a position.
///
/// NOTE(review): `position` is presumably an input offset at which the
/// predicate is evaluated — confirm with the code that builds these values.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerPredicate {
    rule_index: usize,
    pred_index: usize,
    position: usize,
}

impl LexerPredicate {
    /// Packs the three components into a value; no validation is performed.
    pub const fn new(rule_index: usize, pred_index: usize, position: usize) -> Self {
        Self {
            rule_index,
            pred_index,
            position,
        }
    }

    /// Rule index supplied to [`LexerPredicate::new`].
    pub const fn rule_index(self) -> usize {
        let Self { rule_index, .. } = self;
        rule_index
    }

    /// Predicate index supplied to [`LexerPredicate::new`].
    pub const fn pred_index(self) -> usize {
        let Self { pred_index, .. } = self;
        pred_index
    }

    /// Position supplied to [`LexerPredicate::new`].
    pub const fn position(self) -> usize {
        let Self { position, .. } = self;
        position
    }
}
87
/// Mode-management interface that a lexer adds on top of [`Recognizer`].
///
/// The per-method notes describe the behavior of the [`BaseLexer`]
/// implementation in this file; other implementors are not visible here.
pub trait Lexer: Recognizer {
    /// Returns the currently active mode number.
    fn mode(&self) -> i32;
    /// Replaces the active mode with `mode`; `BaseLexer` leaves its mode stack
    /// untouched.
    fn set_mode(&mut self, mode: i32);
    /// Switches to `mode`; `BaseLexer` first saves the previously active mode on
    /// its mode stack.
    fn push_mode(&mut self, mode: i32);
    /// Restores the most recently saved mode and returns it; `BaseLexer` returns
    /// `None` (and keeps the current mode) when nothing was saved.
    fn pop_mode(&mut self) -> Option<i32>;
}
94
/// Shared lexer state: the input stream, mode stack, token-start bookkeeping,
/// line/column tracking, collected errors, and a trace of lexer DFA states.
///
/// `I` is the input stream type and `F` the token factory; the methods in this
/// file require `I: CharStream` and `F: TokenFactory`.
#[derive(Clone, Debug)]
pub struct BaseLexer<I, F = CommonTokenFactory> {
    // Stream the lexer reads from (`la`, `consume`, `seek`, `index`, `text`).
    input: I,
    // Recognizer metadata exposed through the `Recognizer` impl.
    data: RecognizerData,
    // Used by `emit_with_stop` to build tokens from a `TokenSpec`.
    factory: F,
    // Currently active mode; starts at `DEFAULT_MODE`.
    mode: i32,
    // Modes saved by `push_mode`, restored in LIFO order by `pop_mode`.
    mode_stack: Vec<i32>,
    // Input index captured by the last `begin_token` call.
    token_start: usize,
    // Line captured by the last `begin_token` call (initially 1).
    token_start_line: usize,
    // Column captured by the last `begin_token` call (initially 0).
    token_start_column: usize,
    // Current line; starts at 1 and is incremented by `consume_char` on '\n'.
    line: usize,
    // Current column; reset to 0 after '\n', otherwise incremented per symbol.
    column: usize,
    // Flag read/written via `hit_eof` / `set_hit_eof`; not consulted elsewhere
    // in this file.
    hit_eof: bool,
    // Errors appended by `record_error` and handed out by `drain_errors`.
    errors: Vec<TokenSourceError>,
    // State numbers, accept predictions, and edges recorded for
    // `lexer_dfa_string`.
    lexer_dfa: LexerDfaTrace,
}
111
/// Bookkeeping for the lexer DFA states and edges observed so far, kept so
/// that `BaseLexer::lexer_dfa_string` can render them.
#[derive(Clone, Debug, Default)]
struct LexerDfaTrace {
    // State number assigned to each distinct configuration-set key.
    state_numbers: BTreeMap<LexerDfaKey, usize>,
    // Prediction recorded for states that were registered as accepting.
    accept_predictions: BTreeMap<usize, i32>,
    // Recorded transitions; the set keeps them unique and sorted.
    edges: BTreeSet<LexerDfaEdge>,
}

impl LexerDfaTrace {
    /// Creates an empty trace; a `const` counterpart of the derived `Default`
    /// so it can be used from `BaseLexer::with_factory`.
    const fn new() -> Self {
        Self {
            state_numbers: BTreeMap::new(),
            accept_predictions: BTreeMap::new(),
            edges: BTreeSet::new(),
        }
    }
}
130
131#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
133pub(crate) struct LexerDfaKey {
134 configs: Vec<LexerDfaConfigKey>,
135}
136
137impl LexerDfaKey {
138 pub(crate) fn new(mut configs: Vec<LexerDfaConfigKey>) -> Self {
139 configs.sort_unstable();
140 Self { configs }
141 }
142}
143
/// One configuration inside a [`LexerDfaKey`].
///
/// The derived `Ord`/`PartialOrd` compare fields in declaration order, so the
/// field order below determines how `LexerDfaKey::new` sorts configurations.
///
/// NOTE(review): the meaning of each field is defined by the code that builds
/// these keys, which is not in this file; the names suggest an ATN state, the
/// rule of the predicted alternative, EOF/non-greedy flags, the call stack, and
/// pending action indices — confirm against the simulator.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub(crate) struct LexerDfaConfigKey {
    state: usize,
    alt_rule_index: Option<usize>,
    consumed_eof: bool,
    passed_non_greedy: bool,
    stack: Vec<usize>,
    actions: Vec<usize>,
}

impl LexerDfaConfigKey {
    /// Stores the given components unchanged; no validation or normalization
    /// is performed.
    pub(crate) const fn new(
        state: usize,
        alt_rule_index: Option<usize>,
        consumed_eof: bool,
        passed_non_greedy: bool,
        stack: Vec<usize>,
        actions: Vec<usize>,
    ) -> Self {
        Self {
            state,
            alt_rule_index,
            consumed_eof,
            passed_non_greedy,
            stack,
            actions,
        }
    }
}
174
/// One recorded DFA transition: `from` --`symbol`--> `to`.
///
/// The derived ordering compares `from`, then `symbol`, then `to`, which is
/// the order in which `BaseLexer::lexer_dfa_string` lists edges.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct LexerDfaEdge {
    // State number the transition leaves.
    from: usize,
    // Input symbol labelling the transition.
    symbol: i32,
    // State number the transition enters.
    to: usize,
}
183
impl<I> BaseLexer<I>
where
    I: CharStream,
{
    /// Creates a lexer over `input` that builds tokens with the default
    /// [`CommonTokenFactory`].
    ///
    /// Equivalent to `with_factory(input, data, CommonTokenFactory)`.
    pub const fn new(input: I, data: RecognizerData) -> Self {
        Self::with_factory(input, data, CommonTokenFactory)
    }
}
193
194impl<I, F> BaseLexer<I, F>
195where
196 I: CharStream,
197 F: TokenFactory,
198{
199 pub const fn with_factory(input: I, data: RecognizerData, factory: F) -> Self {
201 Self {
202 input,
203 data,
204 factory,
205 mode: DEFAULT_MODE,
206 mode_stack: Vec::new(),
207 token_start: 0,
208 token_start_line: 1,
209 token_start_column: 0,
210 line: 1,
211 column: 0,
212 hit_eof: false,
213 errors: Vec::new(),
214 lexer_dfa: LexerDfaTrace::new(),
215 }
216 }
217
218 pub const fn input(&self) -> &I {
219 &self.input
220 }
221
222 pub const fn input_mut(&mut self) -> &mut I {
223 &mut self.input
224 }
225
226 pub fn begin_token(&mut self) {
229 self.token_start = self.input.index();
230 self.token_start_line = self.line;
231 self.token_start_column = self.column;
232 }
233
234 pub const fn token_start(&self) -> usize {
236 self.token_start
237 }
238
239 pub const fn token_start_line(&self) -> usize {
241 self.token_start_line
242 }
243
244 pub const fn token_start_column(&self) -> usize {
246 self.token_start_column
247 }
248
249 pub fn consume_char(&mut self) {
256 let la = self.input.la(1);
257 if la == EOF {
258 return;
259 }
260 self.input.consume();
261 if char::from_u32(la.cast_unsigned()) == Some('\n') {
262 self.line += 1;
263 self.column = 0;
264 } else {
265 self.column += 1;
266 }
267 }
268
269 pub fn reset_accept_position(&mut self, index: usize) {
276 let target = index.max(self.token_start);
277 self.input.seek(self.token_start);
278 self.line = self.token_start_line;
279 self.column = self.token_start_column;
280 while self.input.index() < target && self.input.la(1) != EOF {
281 self.consume_char();
282 }
283 }
284
285 pub fn emit(&self, token_type: i32, channel: i32, text: Option<String>) -> CommonToken {
293 let stop = self.input.index().checked_sub(1).unwrap_or(usize::MAX);
294 self.emit_with_stop(token_type, channel, stop, text)
295 }
296
297 pub fn emit_with_stop(
303 &self,
304 token_type: i32,
305 channel: i32,
306 stop: usize,
307 text: Option<String>,
308 ) -> CommonToken {
309 let text = text.or_else(|| {
310 if stop == usize::MAX {
311 Some("<EOF>".to_owned())
312 } else {
313 Some(self.input.text(TextInterval::new(self.token_start, stop)))
314 }
315 });
316 self.factory.create(TokenSpec {
317 token_type,
318 channel,
319 start: self.token_start,
320 stop,
321 line: self.token_start_line,
322 column: self.token_start_column,
323 text,
324 source_name: self.input.source_name(),
325 })
326 }
327
328 pub fn token_text(&self) -> String {
331 self.token_text_until(self.input.index())
332 }
333
334 pub fn token_text_until(&self, stop_exclusive: usize) -> String {
342 if stop_exclusive <= self.token_start {
343 return String::new();
344 }
345 self.input
346 .text(TextInterval::new(self.token_start, stop_exclusive - 1))
347 }
348
349 pub fn column_at(&self, position: usize) -> usize {
352 let mut column = self.token_start_column;
353 if position <= self.token_start {
354 return column;
355 }
356 for ch in self
357 .input
358 .text(TextInterval::new(self.token_start, position - 1))
359 .chars()
360 {
361 if ch == '\n' {
362 column = 0;
363 } else {
364 column += 1;
365 }
366 }
367 column
368 }
369
370 pub fn eof_token(&self) -> CommonToken {
372 CommonToken::eof(
373 self.input.source_name(),
374 self.input.index(),
375 self.line,
376 self.column,
377 )
378 }
379}
380
impl<I, F> Recognizer for BaseLexer<I, F>
where
    I: CharStream,
    F: TokenFactory,
{
    /// Shared access to the recognizer data supplied at construction.
    fn data(&self) -> &RecognizerData {
        &self.data
    }

    /// Exclusive access to the recognizer data supplied at construction.
    fn data_mut(&mut self) -> &mut RecognizerData {
        &mut self.data
    }
}
394
395impl<I, F> Lexer for BaseLexer<I, F>
396where
397 I: CharStream,
398 F: TokenFactory,
399{
400 fn mode(&self) -> i32 {
401 self.mode
402 }
403
404 fn set_mode(&mut self, mode: i32) {
405 self.mode = mode;
406 }
407
408 fn push_mode(&mut self, mode: i32) {
409 self.mode_stack.push(self.mode);
410 self.mode = mode;
411 }
412
413 fn pop_mode(&mut self) -> Option<i32> {
414 let mode = self.mode_stack.pop()?;
415 self.mode = mode;
416 Some(mode)
417 }
418}
419
420impl<I, F> BaseLexer<I, F>
421where
422 I: CharStream,
423 F: TokenFactory,
424{
425 pub const fn line(&self) -> usize {
426 self.line
427 }
428
429 pub const fn column(&self) -> usize {
430 self.column
431 }
432
433 pub fn source_name(&self) -> &str {
434 self.input.source_name()
435 }
436
437 pub const fn hit_eof(&self) -> bool {
438 self.hit_eof
439 }
440
441 pub const fn set_hit_eof(&mut self, hit_eof: bool) {
442 self.hit_eof = hit_eof;
443 }
444
445 pub fn record_error(&mut self, line: usize, column: usize, message: impl Into<String>) {
448 self.errors
449 .push(TokenSourceError::new(line, column, message));
450 }
451
452 pub fn drain_errors(&mut self) -> Vec<TokenSourceError> {
454 std::mem::take(&mut self.errors)
455 }
456
457 pub(crate) fn lexer_dfa_state(
460 &mut self,
461 key: LexerDfaKey,
462 accept_prediction: Option<i32>,
463 ) -> usize {
464 let next = self.lexer_dfa.state_numbers.len();
465 let state = *self.lexer_dfa.state_numbers.entry(key).or_insert(next);
466 if let Some(prediction) = accept_prediction {
467 self.lexer_dfa.accept_predictions.insert(state, prediction);
468 }
469 state
470 }
471
472 pub fn record_lexer_dfa_edge(&mut self, from: usize, symbol: i32, to: usize) {
474 self.lexer_dfa
475 .edges
476 .insert(LexerDfaEdge { from, symbol, to });
477 }
478
479 pub fn lexer_dfa_string(&self) -> String {
481 let mut out = String::new();
482 for edge in &self.lexer_dfa.edges {
483 let Some(label) = lexer_dfa_edge_label(edge.symbol) else {
484 continue;
485 };
486 out.push_str(&self.lexer_dfa_state_string(edge.from));
487 out.push('-');
488 out.push_str(&label);
489 out.push_str("->");
490 out.push_str(&self.lexer_dfa_state_string(edge.to));
491 out.push('\n');
492 }
493 out
494 }
495
496 fn lexer_dfa_state_string(&self, state: usize) -> String {
497 self.lexer_dfa.accept_predictions.get(&state).map_or_else(
498 || format!("s{state}"),
499 |prediction| format!(":s{state}=>{prediction}"),
500 )
501 }
502}
503
/// Formats an edge symbol as a single-quoted character, e.g. `'a'`.
///
/// Returns `None` when `symbol` is negative or is not a valid Unicode scalar
/// value (surrogates, values above `0x10FFFF`).
fn lexer_dfa_edge_label(symbol: i32) -> Option<String> {
    let code_point = u32::try_from(symbol).ok()?;
    let ch = char::from_u32(code_point)?;
    Some(format!("'{ch}'"))
}