1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::char_stream::{CharStream, TextInterval};
4use crate::int_stream::EOF;
5use crate::recognizer::{Recognizer, RecognizerData};
6use crate::token::{CommonToken, CommonTokenFactory, TokenFactory, TokenSourceError, TokenSpec};
7
/// Sentinel value `-3`.
///
/// NOTE(review): not referenced in the visible part of this file; presumably
/// the token type a lexer rule reports when the matched text should be
/// discarded instead of emitted — confirm against callers.
pub const SKIP: i32 = -3;
/// Sentinel value `-2`.
///
/// NOTE(review): not referenced in the visible part of this file; presumably
/// the token type a lexer rule reports when matching should continue into the
/// same token — confirm against callers.
pub const MORE: i32 = -2;
/// Mode number a freshly constructed [`BaseLexer`] starts in.
pub const DEFAULT_MODE: i32 = 0;
11
/// Newtype wrapper around a raw `i32` lexer mode number.
///
/// NOTE(review): the mode methods visible in this file ([`Lexer::mode`],
/// [`Lexer::push_mode`], ...) take and return plain `i32` values rather than
/// this wrapper — confirm where it is meant to be used.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerMode(pub i32);
14
/// Identifies one custom lexer action together with an input position.
///
/// The value is a plain `Copy` triple; it only stores what it was built with
/// and hands the pieces back through its accessors.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerCustomAction {
    /// Index of the rule the action belongs to.
    rule_index: i32,
    /// Index of the action itself.
    action_index: i32,
    /// Input position associated with the action.
    position: usize,
}

impl LexerCustomAction {
    /// Bundles a rule index, an action index and an input position.
    pub const fn new(rule_index: i32, action_index: i32, position: usize) -> Self {
        Self {
            rule_index,
            action_index,
            position,
        }
    }

    /// Returns the rule index supplied to [`LexerCustomAction::new`].
    pub const fn rule_index(self) -> i32 {
        self.rule_index
    }

    /// Returns the action index supplied to [`LexerCustomAction::new`].
    pub const fn action_index(self) -> i32 {
        self.action_index
    }

    /// Returns the input position supplied to [`LexerCustomAction::new`].
    pub const fn position(self) -> usize {
        self.position
    }
}
53
/// Identifies one lexer predicate together with an input position.
///
/// The value is a plain `Copy` triple; it only stores what it was built with
/// and hands the pieces back through its accessors.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerPredicate {
    /// Index of the rule the predicate belongs to.
    rule_index: usize,
    /// Index of the predicate itself.
    pred_index: usize,
    /// Input position associated with the predicate.
    position: usize,
}

impl LexerPredicate {
    /// Bundles a rule index, a predicate index and an input position.
    pub const fn new(rule_index: usize, pred_index: usize, position: usize) -> Self {
        Self {
            rule_index,
            pred_index,
            position,
        }
    }

    /// Returns the rule index supplied to [`LexerPredicate::new`].
    pub const fn rule_index(self) -> usize {
        self.rule_index
    }

    /// Returns the predicate index supplied to [`LexerPredicate::new`].
    pub const fn pred_index(self) -> usize {
        self.pred_index
    }

    /// Returns the input position supplied to [`LexerPredicate::new`].
    pub const fn position(self) -> usize {
        self.position
    }
}
87
/// Mode-management interface every lexer provides on top of [`Recognizer`].
///
/// Modes are plain `i32` numbers. The behaviour notes below describe what the
/// [`BaseLexer`] implementation in this file does.
pub trait Lexer: Recognizer {
    /// Returns the mode the lexer is currently in.
    fn mode(&self) -> i32;
    /// Switches to `mode`. `BaseLexer` overwrites the current mode without
    /// touching its mode stack.
    fn set_mode(&mut self, mode: i32);
    /// Enters `mode` so that a later [`Lexer::pop_mode`] can return to the mode
    /// that was active before the call. `BaseLexer` saves the previous mode on
    /// its mode stack.
    fn push_mode(&mut self, mode: i32);
    /// Leaves the current mode and returns the mode that was restored.
    /// `BaseLexer` returns `None`, leaving the current mode unchanged, when
    /// its mode stack is empty.
    fn pop_mode(&mut self) -> Option<i32>;
}
94
/// Shared lexer state: the character input, mode stack, token-start
/// bookkeeping, line/column tracking, collected errors and a DFA trace.
///
/// `I` is the input stream type and `F` the token factory used by the `emit`
/// methods; `F` defaults to [`CommonTokenFactory`].
#[derive(Clone, Debug)]
pub struct BaseLexer<I, F = CommonTokenFactory> {
    /// Character input the lexer reads from.
    input: I,
    /// Recognizer data exposed through the [`Recognizer`] implementation.
    data: RecognizerData,
    /// Factory that `emit_with_stop` asks to create tokens.
    factory: F,
    /// Current lexer mode; starts at [`DEFAULT_MODE`].
    mode: i32,
    /// Modes saved by `push_mode` and restored by `pop_mode`.
    mode_stack: Vec<i32>,
    /// Input index captured by `begin_token`.
    token_start: usize,
    /// Value of `line` captured by `begin_token`.
    token_start_line: usize,
    /// Value of `column` captured by `begin_token`.
    token_start_column: usize,
    /// Current line; starts at 1 and grows by one for every consumed `'\n'`.
    line: usize,
    /// Current column; starts at 0, is reset to 0 after a consumed `'\n'` and
    /// grows by one for every other consumed symbol.
    column: usize,
    /// Flag read by `hit_eof` and written by `set_hit_eof`; nothing else in
    /// this type changes it.
    hit_eof: bool,
    /// Errors collected by `record_error` until `drain_errors` takes them.
    errors: Vec<TokenSourceError>,
    /// Recorded DFA states and edges, rendered by `lexer_dfa_string`.
    lexer_dfa: LexerDfaTrace,
}
111
/// Record of the lexer DFA states and edges reported to a [`BaseLexer`].
///
/// All three collections are B-tree based, so iteration order is determined
/// by the keys rather than by insertion order.
#[derive(Clone, Debug, Default)]
struct LexerDfaTrace {
    /// State number assigned to each state key string.
    state_numbers: BTreeMap<String, usize>,
    /// Prediction recorded for each state number that was reported with one.
    accept_predictions: BTreeMap<usize, i32>,
    /// Distinct edges recorded so far.
    edges: BTreeSet<LexerDfaEdge>,
}

impl LexerDfaTrace {
    /// Creates an empty trace; usable in `const` contexts, unlike `Default`.
    const fn new() -> Self {
        Self {
            state_numbers: BTreeMap::new(),
            accept_predictions: BTreeMap::new(),
            edges: BTreeSet::new(),
        }
    }
}
130
/// One recorded DFA transition.
///
/// The derived ordering compares `from`, then `symbol`, then `to`, which is
/// the order a `BTreeSet` of edges iterates in.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct LexerDfaEdge {
    /// Number of the state the edge leaves.
    from: usize,
    /// Input symbol labelling the edge.
    symbol: i32,
    /// Number of the state the edge enters.
    to: usize,
}
139
impl<I> BaseLexer<I>
where
    I: CharStream,
{
    /// Creates a lexer over `input` that builds tokens with
    /// [`CommonTokenFactory`]; delegates to `with_factory`.
    pub const fn new(input: I, data: RecognizerData) -> Self {
        Self::with_factory(input, data, CommonTokenFactory)
    }
}
149
150impl<I, F> BaseLexer<I, F>
151where
152 I: CharStream,
153 F: TokenFactory,
154{
155 pub const fn with_factory(input: I, data: RecognizerData, factory: F) -> Self {
157 Self {
158 input,
159 data,
160 factory,
161 mode: DEFAULT_MODE,
162 mode_stack: Vec::new(),
163 token_start: 0,
164 token_start_line: 1,
165 token_start_column: 0,
166 line: 1,
167 column: 0,
168 hit_eof: false,
169 errors: Vec::new(),
170 lexer_dfa: LexerDfaTrace::new(),
171 }
172 }
173
174 pub const fn input(&self) -> &I {
175 &self.input
176 }
177
178 pub const fn input_mut(&mut self) -> &mut I {
179 &mut self.input
180 }
181
182 pub fn begin_token(&mut self) {
185 self.token_start = self.input.index();
186 self.token_start_line = self.line;
187 self.token_start_column = self.column;
188 }
189
190 pub const fn token_start(&self) -> usize {
192 self.token_start
193 }
194
195 pub const fn token_start_line(&self) -> usize {
197 self.token_start_line
198 }
199
200 pub const fn token_start_column(&self) -> usize {
202 self.token_start_column
203 }
204
205 pub fn consume_char(&mut self) {
212 let la = self.input.la(1);
213 if la == EOF {
214 return;
215 }
216 self.input.consume();
217 if char::from_u32(la.cast_unsigned()) == Some('\n') {
218 self.line += 1;
219 self.column = 0;
220 } else {
221 self.column += 1;
222 }
223 }
224
225 pub fn reset_accept_position(&mut self, index: usize) {
232 let target = index.max(self.token_start);
233 self.input.seek(self.token_start);
234 self.line = self.token_start_line;
235 self.column = self.token_start_column;
236 while self.input.index() < target && self.input.la(1) != EOF {
237 self.consume_char();
238 }
239 }
240
241 pub fn emit(&self, token_type: i32, channel: i32, text: Option<String>) -> CommonToken {
249 let stop = self.input.index().checked_sub(1).unwrap_or(usize::MAX);
250 self.emit_with_stop(token_type, channel, stop, text)
251 }
252
253 pub fn emit_with_stop(
259 &self,
260 token_type: i32,
261 channel: i32,
262 stop: usize,
263 text: Option<String>,
264 ) -> CommonToken {
265 let text = text.or_else(|| {
266 if stop == usize::MAX {
267 Some("<EOF>".to_owned())
268 } else {
269 Some(self.input.text(TextInterval::new(self.token_start, stop)))
270 }
271 });
272 self.factory.create(TokenSpec {
273 token_type,
274 channel,
275 start: self.token_start,
276 stop,
277 line: self.token_start_line,
278 column: self.token_start_column,
279 text,
280 source_name: self.input.source_name(),
281 })
282 }
283
284 pub fn token_text(&self) -> String {
287 self.token_text_until(self.input.index())
288 }
289
290 pub fn token_text_until(&self, stop_exclusive: usize) -> String {
298 if stop_exclusive <= self.token_start {
299 return String::new();
300 }
301 self.input
302 .text(TextInterval::new(self.token_start, stop_exclusive - 1))
303 }
304
305 pub fn column_at(&self, position: usize) -> usize {
308 let mut column = self.token_start_column;
309 if position <= self.token_start {
310 return column;
311 }
312 for ch in self
313 .input
314 .text(TextInterval::new(self.token_start, position - 1))
315 .chars()
316 {
317 if ch == '\n' {
318 column = 0;
319 } else {
320 column += 1;
321 }
322 }
323 column
324 }
325
326 pub fn eof_token(&self) -> CommonToken {
328 CommonToken::eof(
329 self.input.source_name(),
330 self.input.index(),
331 self.line,
332 self.column,
333 )
334 }
335}
336
impl<I, F> Recognizer for BaseLexer<I, F>
where
    I: CharStream,
    F: TokenFactory,
{
    /// Borrows the recognizer data this lexer was constructed with.
    fn data(&self) -> &RecognizerData {
        &self.data
    }

    /// Mutably borrows the recognizer data this lexer was constructed with.
    fn data_mut(&mut self) -> &mut RecognizerData {
        &mut self.data
    }
}
350
351impl<I, F> Lexer for BaseLexer<I, F>
352where
353 I: CharStream,
354 F: TokenFactory,
355{
356 fn mode(&self) -> i32 {
357 self.mode
358 }
359
360 fn set_mode(&mut self, mode: i32) {
361 self.mode = mode;
362 }
363
364 fn push_mode(&mut self, mode: i32) {
365 self.mode_stack.push(self.mode);
366 self.mode = mode;
367 }
368
369 fn pop_mode(&mut self) -> Option<i32> {
370 let mode = self.mode_stack.pop()?;
371 self.mode = mode;
372 Some(mode)
373 }
374}
375
376impl<I, F> BaseLexer<I, F>
377where
378 I: CharStream,
379 F: TokenFactory,
380{
381 pub const fn line(&self) -> usize {
382 self.line
383 }
384
385 pub const fn column(&self) -> usize {
386 self.column
387 }
388
389 pub fn source_name(&self) -> &str {
390 self.input.source_name()
391 }
392
393 pub const fn hit_eof(&self) -> bool {
394 self.hit_eof
395 }
396
397 pub const fn set_hit_eof(&mut self, hit_eof: bool) {
398 self.hit_eof = hit_eof;
399 }
400
401 pub fn record_error(&mut self, line: usize, column: usize, message: impl Into<String>) {
404 self.errors
405 .push(TokenSourceError::new(line, column, message));
406 }
407
408 pub fn drain_errors(&mut self) -> Vec<TokenSourceError> {
410 std::mem::take(&mut self.errors)
411 }
412
413 pub fn lexer_dfa_state(&mut self, key: String, accept_prediction: Option<i32>) -> usize {
416 let next = self.lexer_dfa.state_numbers.len();
417 let state = *self.lexer_dfa.state_numbers.entry(key).or_insert(next);
418 if let Some(prediction) = accept_prediction {
419 self.lexer_dfa.accept_predictions.insert(state, prediction);
420 }
421 state
422 }
423
424 pub fn record_lexer_dfa_edge(&mut self, from: usize, symbol: i32, to: usize) {
426 self.lexer_dfa
427 .edges
428 .insert(LexerDfaEdge { from, symbol, to });
429 }
430
431 pub fn lexer_dfa_string(&self) -> String {
433 let mut out = String::new();
434 for edge in &self.lexer_dfa.edges {
435 let Some(label) = lexer_dfa_edge_label(edge.symbol) else {
436 continue;
437 };
438 out.push_str(&self.lexer_dfa_state_string(edge.from));
439 out.push('-');
440 out.push_str(&label);
441 out.push_str("->");
442 out.push_str(&self.lexer_dfa_state_string(edge.to));
443 out.push('\n');
444 }
445 out
446 }
447
448 fn lexer_dfa_state_string(&self, state: usize) -> String {
449 self.lexer_dfa.accept_predictions.get(&state).map_or_else(
450 || format!("s{state}"),
451 |prediction| format!(":s{state}=>{prediction}"),
452 )
453 }
454}
455
/// Formats an edge symbol as a single-quoted character, e.g. `'a'`.
///
/// Returns `None` when `symbol` is not a Unicode scalar value: negative
/// numbers, surrogate code points and anything above `0x10FFFF`.
fn lexer_dfa_edge_label(symbol: i32) -> Option<String> {
    let code_point = u32::try_from(symbol).ok()?;
    let ch = char::from_u32(code_point)?;
    Some(format!("'{ch}'"))
}
458}