Skip to main content

antlr4_runtime/
lexer.rs

1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::char_stream::{CharStream, TextInterval};
4use crate::int_stream::EOF;
5use crate::recognizer::{Recognizer, RecognizerData};
6use crate::token::{CommonToken, CommonTokenFactory, TokenFactory, TokenSourceError, TokenSpec};
7
/// Sentinel token type `-3`.
///
/// NOTE(review): presumably mirrors ANTLR's `Lexer.SKIP` (rule matched, no
/// token emitted) — not referenced in this part of the file; confirm at the
/// call sites.
pub const SKIP: i32 = -3;
/// Sentinel token type `-2`.
///
/// NOTE(review): presumably mirrors ANTLR's `Lexer.MORE` (keep matching into
/// the same token) — not referenced in this part of the file; confirm at the
/// call sites.
pub const MORE: i32 = -2;
/// Mode number a freshly constructed `BaseLexer` starts in.
pub const DEFAULT_MODE: i32 = 0;
11
/// Newtype wrapper around a raw `i32` lexer mode number.
///
/// NOTE(review): the `Lexer` trait below takes plain `i32` modes and nothing
/// in this part of the file constructs a `LexerMode`; confirm where it is used.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerMode(pub i32);
14
/// Event describing an embedded, grammar-specific lexer action that lies on
/// the accepted ATN path.
///
/// ANTLR serializes such actions as a `(rule_index, action_index)` pair. In
/// addition to that pair the event remembers the input position at which the
/// action transition was reached, so generated code can evaluate templates
/// such as `Text()` at that point rather than only at the end of the token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexerCustomAction {
    /// Index of the lexer rule that owns the action.
    rule_index: i32,
    /// Index of the action within the serialized ATN metadata.
    action_index: i32,
    /// Character-stream position where the action transition was reached.
    position: usize,
}

impl LexerCustomAction {
    /// Builds an action event from its serialized `(rule_index, action_index)`
    /// pair and the input `position` where the action was reached.
    pub const fn new(rule_index: i32, action_index: i32, position: usize) -> Self {
        LexerCustomAction {
            rule_index,
            action_index,
            position,
        }
    }

    /// Returns the index of the lexer rule that owns the embedded action.
    pub const fn rule_index(self) -> i32 {
        let LexerCustomAction { rule_index, .. } = self;
        rule_index
    }

    /// Returns the per-rule action index assigned by ANTLR serialization.
    pub const fn action_index(self) -> i32 {
        let LexerCustomAction { action_index, .. } = self;
        action_index
    }

    /// Returns the character-stream position at which the action transition
    /// was reached.
    pub const fn position(self) -> usize {
        let LexerCustomAction { position, .. } = self;
        position
    }
}
53
/// Event describing a grammar-specific lexer predicate encountered while an
/// ATN path is being explored.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexerPredicate {
    /// Index of the lexer rule that owns the predicate transition.
    rule_index: usize,
    /// Index of the predicate within the serialized ATN metadata.
    pred_index: usize,
    /// Character-stream position at which the predicate is evaluated.
    position: usize,
}

impl LexerPredicate {
    /// Builds a predicate event from its serialized `(rule_index, pred_index)`
    /// pair and the input `position` where it is evaluated.
    pub const fn new(rule_index: usize, pred_index: usize, position: usize) -> Self {
        LexerPredicate {
            rule_index,
            pred_index,
            position,
        }
    }

    /// Returns the index of the lexer rule that owns the predicate transition.
    pub const fn rule_index(self) -> usize {
        let LexerPredicate { rule_index, .. } = self;
        rule_index
    }

    /// Returns the per-rule predicate index assigned by ANTLR serialization.
    pub const fn pred_index(self) -> usize {
        let LexerPredicate { pred_index, .. } = self;
        pred_index
    }

    /// Returns the character-stream position at which the predicate is
    /// evaluated.
    pub const fn position(self) -> usize {
        let LexerPredicate { position, .. } = self;
        position
    }
}
87
/// Mode-management operations a lexer exposes on top of [`Recognizer`].
///
/// Modes are plain `i32` numbers. The behaviour notes below describe what the
/// `BaseLexer` implementation in this file does.
pub trait Lexer: Recognizer {
    /// Returns the mode the lexer is currently in.
    fn mode(&self) -> i32;
    /// Switches to `mode` without touching the mode stack.
    fn set_mode(&mut self, mode: i32);
    /// Saves the current mode on the mode stack, then switches to `mode`.
    fn push_mode(&mut self, mode: i32);
    /// Restores the most recently saved mode and returns it, or returns
    /// `None` (leaving the current mode unchanged) when no mode was saved.
    fn pop_mode(&mut self) -> Option<i32>;
}
94
/// Shared lexer state: input cursor, mode stack, source-position tracking,
/// buffered diagnostics, and the observed lexer DFA trace.
///
/// `I` is the character stream being lexed and `F` the factory used to build
/// tokens; `F` defaults to `CommonTokenFactory`.
#[derive(Clone, Debug)]
pub struct BaseLexer<I, F = CommonTokenFactory> {
    /// Character stream the lexer reads from.
    input: I,
    /// Recognizer metadata exposed through the `Recognizer` trait.
    data: RecognizerData,
    /// Factory used by `emit_with_stop` to build tokens.
    factory: F,
    /// Current lexer mode; starts at `DEFAULT_MODE`.
    mode: i32,
    /// Modes saved by `push_mode`, restored in LIFO order by `pop_mode`.
    mode_stack: Vec<i32>,
    /// Input index captured by `begin_token`.
    token_start: usize,
    /// Line captured by `begin_token`.
    token_start_line: usize,
    /// Column captured by `begin_token`.
    token_start_column: usize,
    /// Current line; starts at 1 and is incremented by `consume_char` after `\n`.
    line: usize,
    /// Current column; starts at 0 and is reset to 0 by `consume_char` after `\n`.
    column: usize,
    /// Flag read and written through `hit_eof` / `set_hit_eof`.
    hit_eof: bool,
    /// Diagnostics buffered by `record_error` until `drain_errors` is called.
    errors: Vec<TokenSourceError>,
    /// States and edges recorded for `lexer_dfa_string`.
    lexer_dfa: LexerDfaTrace,
}
111
112/// Compact observation log for the default-mode lexer DFA printed by `showDFA`
113/// runtime-suite descriptors.
114#[derive(Clone, Debug, Default)]
115struct LexerDfaTrace {
116    state_numbers: BTreeMap<String, usize>,
117    accept_predictions: BTreeMap<usize, i32>,
118    edges: BTreeSet<LexerDfaEdge>,
119}
120
121impl LexerDfaTrace {
122    const fn new() -> Self {
123        Self {
124            state_numbers: BTreeMap::new(),
125            accept_predictions: BTreeMap::new(),
126            edges: BTreeSet::new(),
127        }
128    }
129}
130
/// One printable lexer DFA edge keyed so repeated matches keep deterministic
/// output order.
///
/// The derived `Ord` compares `from`, then `symbol`, then `to` (declaration
/// order), which is the order `BTreeSet<LexerDfaEdge>` iterates in. Do not
/// reorder the fields without intending to change that ordering.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct LexerDfaEdge {
    /// State number the edge leaves.
    from: usize,
    /// Input symbol labelling the edge.
    symbol: i32,
    /// State number the edge enters.
    to: usize,
}
139
140impl<I> BaseLexer<I>
141where
142    I: CharStream,
143{
144    /// Creates a lexer base using `CommonTokenFactory`.
145    pub const fn new(input: I, data: RecognizerData) -> Self {
146        Self::with_factory(input, data, CommonTokenFactory)
147    }
148}
149
150impl<I, F> BaseLexer<I, F>
151where
152    I: CharStream,
153    F: TokenFactory,
154{
155    /// Creates a lexer base with a custom token factory.
156    pub const fn with_factory(input: I, data: RecognizerData, factory: F) -> Self {
157        Self {
158            input,
159            data,
160            factory,
161            mode: DEFAULT_MODE,
162            mode_stack: Vec::new(),
163            token_start: 0,
164            token_start_line: 1,
165            token_start_column: 0,
166            line: 1,
167            column: 0,
168            hit_eof: false,
169            errors: Vec::new(),
170            lexer_dfa: LexerDfaTrace::new(),
171        }
172    }
173
174    pub const fn input(&self) -> &I {
175        &self.input
176    }
177
178    pub const fn input_mut(&mut self) -> &mut I {
179        &mut self.input
180    }
181
182    /// Captures the input index and source position for the token currently
183    /// being matched.
184    pub fn begin_token(&mut self) {
185        self.token_start = self.input.index();
186        self.token_start_line = self.line;
187        self.token_start_column = self.column;
188    }
189
190    /// Returns the absolute character index where the current token began.
191    pub const fn token_start(&self) -> usize {
192        self.token_start
193    }
194
195    /// Returns the source line captured at the start of the current token.
196    pub const fn token_start_line(&self) -> usize {
197        self.token_start_line
198    }
199
200    /// Returns the source column captured at the start of the current token.
201    pub const fn token_start_column(&self) -> usize {
202        self.token_start_column
203    }
204
205    /// Consumes one character from the input stream and updates lexer line and
206    /// column counters.
207    ///
208    /// The input stream is indexed by Unicode scalar values. Newline handling
209    /// follows ANTLR's default convention of incrementing the line and resetting
210    /// the column after `\n`.
211    pub fn consume_char(&mut self) {
212        let la = self.input.la(1);
213        if la == EOF {
214            return;
215        }
216        self.input.consume();
217        if char::from_u32(la.cast_unsigned()) == Some('\n') {
218            self.line += 1;
219            self.column = 0;
220        } else {
221            self.column += 1;
222        }
223    }
224
225    /// Rewinds or advances the input cursor to a token accept boundary.
226    ///
227    /// Some generated lexers intentionally accept a longer path to disambiguate
228    /// a token, then emit only the prefix and leave the suffix for the next
229    /// token. Recomputing line/column from `token_start` keeps the visible lexer
230    /// position consistent after moving the cursor backwards.
231    pub fn reset_accept_position(&mut self, index: usize) {
232        let target = index.max(self.token_start);
233        self.input.seek(self.token_start);
234        self.line = self.token_start_line;
235        self.column = self.token_start_column;
236        while self.input.index() < target && self.input.la(1) != EOF {
237            self.consume_char();
238        }
239    }
240
241    /// Builds a token spanning from the current token start to the character
242    /// before the input cursor.
243    ///
244    /// When generated or interpreted lexer code does not supply explicit text,
245    /// the base lexer captures the matched source interval so downstream token
246    /// streams and parse trees can render token text without retaining a source
247    /// pair object.
248    pub fn emit(&self, token_type: i32, channel: i32, text: Option<String>) -> CommonToken {
249        let stop = self.input.index().checked_sub(1).unwrap_or(usize::MAX);
250        self.emit_with_stop(token_type, channel, stop, text)
251    }
252
253    /// Builds a token with an explicit stop index.
254    ///
255    /// EOF-matching lexer rules do not consume a Unicode scalar value, so their
256    /// stop index can be one before the current input index. The caller passes
257    /// `usize::MAX` to represent ANTLR's `-1` stop index at empty input.
258    pub fn emit_with_stop(
259        &self,
260        token_type: i32,
261        channel: i32,
262        stop: usize,
263        text: Option<String>,
264    ) -> CommonToken {
265        let text = text.or_else(|| {
266            if stop == usize::MAX {
267                Some("<EOF>".to_owned())
268            } else {
269                Some(self.input.text(TextInterval::new(self.token_start, stop)))
270            }
271        });
272        self.factory.create(TokenSpec {
273            token_type,
274            channel,
275            start: self.token_start,
276            stop,
277            line: self.token_start_line,
278            column: self.token_start_column,
279            text,
280            source_name: self.input.source_name(),
281        })
282    }
283
284    /// Returns the current token text from the token start through the input
285    /// cursor.
286    pub fn token_text(&self) -> String {
287        self.token_text_until(self.input.index())
288    }
289
290    /// Returns the current token text from the token start through
291    /// `stop_exclusive`.
292    ///
293    /// Lexer custom actions can occur before the accepted token is complete.
294    /// The action event records the position where the transition fired, and
295    /// generated action code uses this helper to render ANTLR's `Text()`
296    /// template at that exact point.
297    pub fn token_text_until(&self, stop_exclusive: usize) -> String {
298        if stop_exclusive <= self.token_start {
299            return String::new();
300        }
301        self.input
302            .text(TextInterval::new(self.token_start, stop_exclusive - 1))
303    }
304
305    /// Computes the zero-based source column at an absolute input position
306    /// reached during prediction of the current token.
307    pub fn column_at(&self, position: usize) -> usize {
308        let mut column = self.token_start_column;
309        if position <= self.token_start {
310            return column;
311        }
312        for ch in self
313            .input
314            .text(TextInterval::new(self.token_start, position - 1))
315            .chars()
316        {
317            if ch == '\n' {
318                column = 0;
319            } else {
320                column += 1;
321            }
322        }
323        column
324    }
325
326    /// Builds the synthetic EOF token at the current input cursor.
327    pub fn eof_token(&self) -> CommonToken {
328        CommonToken::eof(
329            self.input.source_name(),
330            self.input.index(),
331            self.line,
332            self.column,
333        )
334    }
335}
336
/// Exposes the lexer's stored `RecognizerData` through the `Recognizer` trait.
impl<I, F> Recognizer for BaseLexer<I, F>
where
    I: CharStream,
    F: TokenFactory,
{
    /// Shared access to the recognizer metadata passed at construction.
    fn data(&self) -> &RecognizerData {
        &self.data
    }

    /// Exclusive access to the recognizer metadata passed at construction.
    fn data_mut(&mut self) -> &mut RecognizerData {
        &mut self.data
    }
}
350
351impl<I, F> Lexer for BaseLexer<I, F>
352where
353    I: CharStream,
354    F: TokenFactory,
355{
356    fn mode(&self) -> i32 {
357        self.mode
358    }
359
360    fn set_mode(&mut self, mode: i32) {
361        self.mode = mode;
362    }
363
364    fn push_mode(&mut self, mode: i32) {
365        self.mode_stack.push(self.mode);
366        self.mode = mode;
367    }
368
369    fn pop_mode(&mut self) -> Option<i32> {
370        let mode = self.mode_stack.pop()?;
371        self.mode = mode;
372        Some(mode)
373    }
374}
375
376impl<I, F> BaseLexer<I, F>
377where
378    I: CharStream,
379    F: TokenFactory,
380{
381    pub const fn line(&self) -> usize {
382        self.line
383    }
384
385    pub const fn column(&self) -> usize {
386        self.column
387    }
388
389    pub fn source_name(&self) -> &str {
390        self.input.source_name()
391    }
392
393    pub const fn hit_eof(&self) -> bool {
394        self.hit_eof
395    }
396
397    pub const fn set_hit_eof(&mut self, hit_eof: bool) {
398        self.hit_eof = hit_eof;
399    }
400
401    /// Buffers a lexer diagnostic until the token stream consumer is ready to
402    /// emit errors in parser-compatible order.
403    pub fn record_error(&mut self, line: usize, column: usize, message: impl Into<String>) {
404        self.errors
405            .push(TokenSourceError::new(line, column, message));
406    }
407
408    /// Returns and clears lexer diagnostics produced while fetching tokens.
409    pub fn drain_errors(&mut self) -> Vec<TokenSourceError> {
410        std::mem::take(&mut self.errors)
411    }
412
413    /// Returns the stable state number for a normalized lexer DFA config set,
414    /// creating one if this input path has not reached it before.
415    pub fn lexer_dfa_state(&mut self, key: String, accept_prediction: Option<i32>) -> usize {
416        let next = self.lexer_dfa.state_numbers.len();
417        let state = *self.lexer_dfa.state_numbers.entry(key).or_insert(next);
418        if let Some(prediction) = accept_prediction {
419            self.lexer_dfa.accept_predictions.insert(state, prediction);
420        }
421        state
422    }
423
424    /// Records a visible lexer DFA edge unless it was already observed.
425    pub fn record_lexer_dfa_edge(&mut self, from: usize, symbol: i32, to: usize) {
426        self.lexer_dfa
427            .edges
428            .insert(LexerDfaEdge { from, symbol, to });
429    }
430
431    /// Serializes the observed default-mode lexer DFA in ANTLR's text shape.
432    pub fn lexer_dfa_string(&self) -> String {
433        let mut out = String::new();
434        for edge in &self.lexer_dfa.edges {
435            let Some(label) = lexer_dfa_edge_label(edge.symbol) else {
436                continue;
437            };
438            out.push_str(&self.lexer_dfa_state_string(edge.from));
439            out.push('-');
440            out.push_str(&label);
441            out.push_str("->");
442            out.push_str(&self.lexer_dfa_state_string(edge.to));
443            out.push('\n');
444        }
445        out
446    }
447
448    fn lexer_dfa_state_string(&self, state: usize) -> String {
449        self.lexer_dfa.accept_predictions.get(&state).map_or_else(
450            || format!("s{state}"),
451            |prediction| format!(":s{state}=>{prediction}"),
452        )
453    }
454}
455
/// Renders an edge symbol as a single-quoted character, e.g. `'a'`.
///
/// Returns `None` when `symbol` is not a Unicode scalar value: negative
/// numbers, surrogate code points, and values above `0x10FFFF`.
fn lexer_dfa_edge_label(symbol: i32) -> Option<String> {
    // A negative symbol can never name a scalar value, so reject it up front.
    let code_point = u32::try_from(symbol).ok()?;
    let ch = char::from_u32(code_point)?;
    Some(format!("'{ch}'"))
}