Skip to main content

antlr4_runtime/
lexer.rs

1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::char_stream::{CharStream, TextInterval};
4use crate::int_stream::EOF;
5use crate::recognizer::{Recognizer, RecognizerData};
6use crate::token::{CommonToken, CommonTokenFactory, TokenFactory, TokenSourceError, TokenSpec};
7
/// Reserved `i32` value `-3`.
///
/// NOTE(review): name and value match ANTLR's `Lexer.SKIP` (a rule result that
/// tells the lexer to discard the match instead of emitting a token) — confirm
/// against the code that consumes this constant.
pub const SKIP: i32 = -3;
/// Reserved `i32` value `-2`.
///
/// NOTE(review): name and value match ANTLR's `Lexer.MORE` (keep matching and
/// fold the text into the next token) — confirm against the code that consumes
/// this constant.
pub const MORE: i32 = -2;
/// Mode number a freshly constructed `BaseLexer` starts in.
pub const DEFAULT_MODE: i32 = 0;
11
/// Newtype wrapper around a raw `i32` lexer mode number.
///
/// NOTE(review): the `Lexer` trait in this file passes modes as bare `i32`
/// values; where this wrapper is consumed is not visible from here.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerMode(pub i32);
14
/// Grammar-specific lexer action reached on the accepted ATN path.
///
/// ANTLR serializes embedded lexer actions as `(rule_index, action_index)`
/// pairs. The runtime also records the input position where the action was
/// reached so generated code can evaluate templates such as `Text()` at the
/// same point as a generated ANTLR lexer, not only at the token end.
///
/// The value is a small `Copy` record; all accessors take `self` by value.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerCustomAction {
    /// Index of the lexer rule that owns the action.
    rule_index: i32,
    /// Index of the action within its rule.
    action_index: i32,
    /// Character-stream position at which the action transition was reached.
    position: usize,
}

impl LexerCustomAction {
    /// Creates a custom lexer action event from serialized ATN metadata.
    ///
    /// The three arguments are stored unchanged and can be read back through
    /// the accessors of the same name.
    pub const fn new(rule_index: i32, action_index: i32, position: usize) -> Self {
        Self {
            rule_index,
            action_index,
            position,
        }
    }

    /// Lexer rule index that owns the embedded action.
    pub const fn rule_index(self) -> i32 {
        self.rule_index
    }

    /// Per-rule action index assigned by ANTLR serialization.
    pub const fn action_index(self) -> i32 {
        self.action_index
    }

    /// Character-stream position at which the action transition was reached.
    pub const fn position(self) -> usize {
        self.position
    }
}
53
/// Grammar-specific lexer predicate reached while exploring an ATN path.
///
/// Unlike `LexerCustomAction`, the rule and predicate indexes are stored as
/// `usize`. The value is a small `Copy` record; all accessors take `self` by
/// value.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct LexerPredicate {
    /// Index of the lexer rule that owns the predicate transition.
    rule_index: usize,
    /// Index of the predicate within its rule.
    pred_index: usize,
    /// Character-stream position at which the predicate is evaluated.
    position: usize,
}

impl LexerPredicate {
    /// Creates a lexer predicate event from serialized ATN metadata.
    ///
    /// The three arguments are stored unchanged and can be read back through
    /// the accessors of the same name.
    pub const fn new(rule_index: usize, pred_index: usize, position: usize) -> Self {
        Self {
            rule_index,
            pred_index,
            position,
        }
    }

    /// Lexer rule index that owns the predicate transition.
    pub const fn rule_index(self) -> usize {
        self.rule_index
    }

    /// Per-rule predicate index assigned by ANTLR serialization.
    pub const fn pred_index(self) -> usize {
        self.pred_index
    }

    /// Character-stream position at which the predicate is evaluated.
    pub const fn position(self) -> usize {
        self.position
    }
}
87
/// Mode-switching interface layered on top of `Recognizer`.
///
/// Modes are plain `i32` numbers. The per-method notes describe the behavior
/// of the `BaseLexer` implementation in this file.
pub trait Lexer: Recognizer {
    /// Returns the mode the lexer is currently in.
    fn mode(&self) -> i32;
    /// Replaces the current mode; `BaseLexer` leaves its mode stack untouched.
    fn set_mode(&mut self, mode: i32);
    /// Switches to `mode`; `BaseLexer` first saves the current mode on its
    /// mode stack.
    fn push_mode(&mut self, mode: i32);
    /// Restores the most recently pushed mode and returns it; `BaseLexer`
    /// returns `None` and keeps the current mode when nothing was pushed.
    fn pop_mode(&mut self) -> Option<i32>;
}
94
/// Shared lexer state: the input stream, mode bookkeeping, source-position
/// counters, buffered diagnostics, and the observed DFA trace.
///
/// `I` is the character stream being tokenized and `F` the factory used to
/// build tokens (defaults to `CommonTokenFactory`).
#[derive(Clone, Debug)]
pub struct BaseLexer<I, F = CommonTokenFactory> {
    /// Character stream the lexer reads from.
    input: I,
    /// Recognizer metadata exposed through the `Recognizer` impl.
    data: RecognizerData,
    /// Factory that turns a `TokenSpec` into a token in `emit_with_stop`.
    factory: F,
    /// Current lexer mode.
    mode: i32,
    /// Modes saved by `push_mode`, restored in LIFO order by `pop_mode`.
    mode_stack: Vec<i32>,
    /// Input index captured by the last `begin_token` call.
    token_start: usize,
    /// Line captured by the last `begin_token` call.
    token_start_line: usize,
    /// Column captured by the last `begin_token` call.
    token_start_column: usize,
    /// Current line; starts at 1 and is bumped by `consume_char` after `\n`.
    line: usize,
    /// Current column; starts at 0 and is reset by `consume_char` after `\n`.
    column: usize,
    /// Flag stored by `set_hit_eof` and read by `hit_eof`.
    hit_eof: bool,
    /// Diagnostics buffered by `record_error` until `drain_errors` is called.
    errors: Vec<TokenSourceError>,
    /// Observed DFA states and edges rendered by `lexer_dfa_string`.
    lexer_dfa: LexerDfaTrace,
}
111
/// Compact observation log for the default-mode lexer DFA printed by `showDFA`
/// runtime-suite descriptors.
///
/// All three collections are B-tree based, so iteration order is
/// deterministic.
#[derive(Clone, Debug, Default)]
struct LexerDfaTrace {
    /// State number assigned to each normalized config set, in first-seen
    /// order.
    state_numbers: BTreeMap<LexerDfaKey, usize>,
    /// Prediction recorded for each state number that was reported as
    /// accepting.
    accept_predictions: BTreeMap<usize, i32>,
    /// Distinct edges observed so far.
    edges: BTreeSet<LexerDfaEdge>,
}

impl LexerDfaTrace {
    /// Creates an empty trace; `const` so `BaseLexer::with_factory` can stay a
    /// `const fn`.
    const fn new() -> Self {
        Self {
            state_numbers: BTreeMap::new(),
            accept_predictions: BTreeMap::new(),
            edges: BTreeSet::new(),
        }
    }
}
130
/// Normalized lexer ATN config-set identity used for observed DFA traces.
///
/// Two keys built from the same configs compare equal regardless of the order
/// in which the configs were supplied.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub(crate) struct LexerDfaKey {
    /// Config identities, kept sorted.
    configs: Vec<LexerDfaConfigKey>,
}

impl LexerDfaKey {
    /// Builds a key from `configs`, sorting them so the key is independent of
    /// input order. Duplicates are kept as supplied.
    pub(crate) fn new(mut configs: Vec<LexerDfaConfigKey>) -> Self {
        // Equal elements are indistinguishable, so an unstable sort is enough.
        configs.sort_unstable();
        Self { configs }
    }
}
143
/// One lexer ATN config identity with the absolute input position removed.
///
/// The derived ordering compares fields in declaration order; `LexerDfaKey`
/// relies on it to sort configs.
///
/// NOTE(review): the field meanings below are inferred from their names; the
/// code that builds these keys is not visible in this file — confirm there.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub(crate) struct LexerDfaConfigKey {
    /// Presumably the ATN state number of the config.
    state: usize,
    /// Presumably the rule index of the predicted alternative, if any.
    alt_rule_index: Option<usize>,
    /// Presumably whether the config's path has already matched EOF.
    consumed_eof: bool,
    /// Presumably whether the path passed through a non-greedy decision.
    passed_non_greedy: bool,
    /// Presumably the return-state stack of the config.
    stack: Vec<usize>,
    /// Presumably identifiers of the lexer actions collected on the path.
    actions: Vec<usize>,
}

impl LexerDfaConfigKey {
    /// Creates a config identity; every argument is stored unchanged in the
    /// field of the same name.
    pub(crate) const fn new(
        state: usize,
        alt_rule_index: Option<usize>,
        consumed_eof: bool,
        passed_non_greedy: bool,
        stack: Vec<usize>,
        actions: Vec<usize>,
    ) -> Self {
        Self {
            state,
            alt_rule_index,
            consumed_eof,
            passed_non_greedy,
            stack,
            actions,
        }
    }
}
174
/// One printable lexer DFA edge keyed so repeated matches keep deterministic
/// output order.
///
/// The derived ordering compares `from`, then `symbol`, then `to`; because
/// edges live in a `BTreeSet`, that is also the order in which they are
/// printed.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct LexerDfaEdge {
    /// State number the edge leaves.
    from: usize,
    /// Input symbol labelling the edge.
    symbol: i32,
    /// State number the edge enters.
    to: usize,
}
183
impl<I> BaseLexer<I>
where
    I: CharStream,
{
    /// Creates a lexer base using `CommonTokenFactory`.
    ///
    /// Shorthand for `with_factory(input, data, CommonTokenFactory)`; see that
    /// constructor for the initial lexer state.
    pub const fn new(input: I, data: RecognizerData) -> Self {
        Self::with_factory(input, data, CommonTokenFactory)
    }
}
193
194impl<I, F> BaseLexer<I, F>
195where
196    I: CharStream,
197    F: TokenFactory,
198{
    /// Creates a lexer base with a custom token factory.
    ///
    /// The lexer starts in `DEFAULT_MODE` with an empty mode stack, at line 1,
    /// column 0, with the token start at index 0, the EOF flag cleared, no
    /// buffered errors, and an empty DFA trace.
    pub const fn with_factory(input: I, data: RecognizerData, factory: F) -> Self {
        Self {
            input,
            data,
            factory,
            mode: DEFAULT_MODE,
            mode_stack: Vec::new(),
            token_start: 0,
            token_start_line: 1,
            token_start_column: 0,
            line: 1,
            column: 0,
            hit_eof: false,
            errors: Vec::new(),
            lexer_dfa: LexerDfaTrace::new(),
        }
    }
217
    /// Borrows the underlying character stream.
    pub const fn input(&self) -> &I {
        &self.input
    }
221
    /// Mutably borrows the underlying character stream.
    ///
    /// Moving the stream through this handle does not update the lexer's
    /// `line`/`column` counters; `consume_char` and `reset_accept_position`
    /// are the methods in this file that keep them in sync.
    pub const fn input_mut(&mut self) -> &mut I {
        &mut self.input
    }
225
226    /// Captures the input index and source position for the token currently
227    /// being matched.
228    pub fn begin_token(&mut self) {
229        self.token_start = self.input.index();
230        self.token_start_line = self.line;
231        self.token_start_column = self.column;
232    }
233
    /// Returns the absolute character index where the current token began, as
    /// captured by the most recent `begin_token` call (0 before the first
    /// call).
    pub const fn token_start(&self) -> usize {
        self.token_start
    }
238
    /// Returns the source line captured at the start of the current token by
    /// the most recent `begin_token` call (1 before the first call).
    pub const fn token_start_line(&self) -> usize {
        self.token_start_line
    }
243
    /// Returns the source column captured at the start of the current token by
    /// the most recent `begin_token` call (0 before the first call).
    pub const fn token_start_column(&self) -> usize {
        self.token_start_column
    }
248
249    /// Consumes one character from the input stream and updates lexer line and
250    /// column counters.
251    ///
252    /// The input stream is indexed by Unicode scalar values. Newline handling
253    /// follows ANTLR's default convention of incrementing the line and resetting
254    /// the column after `\n`.
255    pub fn consume_char(&mut self) {
256        let la = self.input.la(1);
257        if la == EOF {
258            return;
259        }
260        self.input.consume();
261        if char::from_u32(la.cast_unsigned()) == Some('\n') {
262            self.line += 1;
263            self.column = 0;
264        } else {
265            self.column += 1;
266        }
267    }
268
269    /// Rewinds or advances the input cursor to a token accept boundary.
270    ///
271    /// Some generated lexers intentionally accept a longer path to disambiguate
272    /// a token, then emit only the prefix and leave the suffix for the next
273    /// token. Recomputing line/column from `token_start` keeps the visible lexer
274    /// position consistent after moving the cursor backwards.
275    pub fn reset_accept_position(&mut self, index: usize) {
276        let target = index.max(self.token_start);
277        self.input.seek(self.token_start);
278        self.line = self.token_start_line;
279        self.column = self.token_start_column;
280        while self.input.index() < target && self.input.la(1) != EOF {
281            self.consume_char();
282        }
283    }
284
285    /// Builds a token spanning from the current token start to the character
286    /// before the input cursor.
287    ///
288    /// When generated or interpreted lexer code does not supply explicit text,
289    /// the base lexer captures the matched source interval so downstream token
290    /// streams and parse trees can render token text without retaining a source
291    /// pair object.
292    pub fn emit(&self, token_type: i32, channel: i32, text: Option<String>) -> CommonToken {
293        let stop = self.input.index().checked_sub(1).unwrap_or(usize::MAX);
294        self.emit_with_stop(token_type, channel, stop, text)
295    }
296
297    /// Builds a token with an explicit stop index.
298    ///
299    /// EOF-matching lexer rules do not consume a Unicode scalar value, so their
300    /// stop index can be one before the current input index. The caller passes
301    /// `usize::MAX` to represent ANTLR's `-1` stop index at empty input.
302    pub fn emit_with_stop(
303        &self,
304        token_type: i32,
305        channel: i32,
306        stop: usize,
307        text: Option<String>,
308    ) -> CommonToken {
309        let text = text.or_else(|| {
310            if stop == usize::MAX {
311                Some("<EOF>".to_owned())
312            } else {
313                Some(self.input.text(TextInterval::new(self.token_start, stop)))
314            }
315        });
316        self.factory.create(TokenSpec {
317            token_type,
318            channel,
319            start: self.token_start,
320            stop,
321            line: self.token_start_line,
322            column: self.token_start_column,
323            text,
324            source_name: self.input.source_name(),
325        })
326    }
327
328    /// Returns the current token text from the token start through the input
329    /// cursor.
330    pub fn token_text(&self) -> String {
331        self.token_text_until(self.input.index())
332    }
333
334    /// Returns the current token text from the token start through
335    /// `stop_exclusive`.
336    ///
337    /// Lexer custom actions can occur before the accepted token is complete.
338    /// The action event records the position where the transition fired, and
339    /// generated action code uses this helper to render ANTLR's `Text()`
340    /// template at that exact point.
341    pub fn token_text_until(&self, stop_exclusive: usize) -> String {
342        if stop_exclusive <= self.token_start {
343            return String::new();
344        }
345        self.input
346            .text(TextInterval::new(self.token_start, stop_exclusive - 1))
347    }
348
349    /// Computes the zero-based source column at an absolute input position
350    /// reached during prediction of the current token.
351    pub fn column_at(&self, position: usize) -> usize {
352        let mut column = self.token_start_column;
353        if position <= self.token_start {
354            return column;
355        }
356        for ch in self
357            .input
358            .text(TextInterval::new(self.token_start, position - 1))
359            .chars()
360        {
361            if ch == '\n' {
362                column = 0;
363            } else {
364                column += 1;
365            }
366        }
367        column
368    }
369
370    /// Builds the synthetic EOF token at the current input cursor.
371    pub fn eof_token(&self) -> CommonToken {
372        CommonToken::eof(
373            self.input.source_name(),
374            self.input.index(),
375            self.line,
376            self.column,
377        )
378    }
379}
380
/// Exposes the lexer's `RecognizerData` through the `Recognizer` trait.
impl<I, F> Recognizer for BaseLexer<I, F>
where
    I: CharStream,
    F: TokenFactory,
{
    /// Borrows the recognizer metadata supplied at construction.
    fn data(&self) -> &RecognizerData {
        &self.data
    }

    /// Mutably borrows the recognizer metadata supplied at construction.
    fn data_mut(&mut self) -> &mut RecognizerData {
        &mut self.data
    }
}
394
395impl<I, F> Lexer for BaseLexer<I, F>
396where
397    I: CharStream,
398    F: TokenFactory,
399{
400    fn mode(&self) -> i32 {
401        self.mode
402    }
403
404    fn set_mode(&mut self, mode: i32) {
405        self.mode = mode;
406    }
407
408    fn push_mode(&mut self, mode: i32) {
409        self.mode_stack.push(self.mode);
410        self.mode = mode;
411    }
412
413    fn pop_mode(&mut self) -> Option<i32> {
414        let mode = self.mode_stack.pop()?;
415        self.mode = mode;
416        Some(mode)
417    }
418}
419
420impl<I, F> BaseLexer<I, F>
421where
422    I: CharStream,
423    F: TokenFactory,
424{
    /// Returns the lexer's current line; starts at 1 and grows by one for each
    /// `\n` consumed through `consume_char`.
    pub const fn line(&self) -> usize {
        self.line
    }
428
    /// Returns the lexer's current zero-based column; reset to 0 after each
    /// `\n` consumed through `consume_char`.
    pub const fn column(&self) -> usize {
        self.column
    }
432
    /// Returns the source name reported by the underlying character stream.
    pub fn source_name(&self) -> &str {
        self.input.source_name()
    }
436
    /// Returns the EOF flag last stored with `set_hit_eof` (`false` on a new
    /// lexer).
    pub const fn hit_eof(&self) -> bool {
        self.hit_eof
    }
440
    /// Stores the EOF flag later reported by `hit_eof`.
    ///
    /// Nothing in this file sets the flag automatically; callers are
    /// responsible for updating it.
    pub const fn set_hit_eof(&mut self, hit_eof: bool) {
        self.hit_eof = hit_eof;
    }
444
445    /// Buffers a lexer diagnostic until the token stream consumer is ready to
446    /// emit errors in parser-compatible order.
447    pub fn record_error(&mut self, line: usize, column: usize, message: impl Into<String>) {
448        self.errors
449            .push(TokenSourceError::new(line, column, message));
450    }
451
    /// Returns and clears lexer diagnostics produced while fetching tokens.
    ///
    /// The diagnostics come back in the order they were recorded, and the
    /// internal buffer is left empty.
    pub fn drain_errors(&mut self) -> Vec<TokenSourceError> {
        std::mem::take(&mut self.errors)
    }
456
457    /// Returns the stable state number for a normalized lexer DFA config set,
458    /// creating one if this input path has not reached it before.
459    pub(crate) fn lexer_dfa_state(
460        &mut self,
461        key: LexerDfaKey,
462        accept_prediction: Option<i32>,
463    ) -> usize {
464        let next = self.lexer_dfa.state_numbers.len();
465        let state = *self.lexer_dfa.state_numbers.entry(key).or_insert(next);
466        if let Some(prediction) = accept_prediction {
467            self.lexer_dfa.accept_predictions.insert(state, prediction);
468        }
469        state
470    }
471
472    /// Records a visible lexer DFA edge unless it was already observed.
473    pub fn record_lexer_dfa_edge(&mut self, from: usize, symbol: i32, to: usize) {
474        self.lexer_dfa
475            .edges
476            .insert(LexerDfaEdge { from, symbol, to });
477    }
478
479    /// Serializes the observed default-mode lexer DFA in ANTLR's text shape.
480    pub fn lexer_dfa_string(&self) -> String {
481        let mut out = String::new();
482        for edge in &self.lexer_dfa.edges {
483            let Some(label) = lexer_dfa_edge_label(edge.symbol) else {
484                continue;
485            };
486            out.push_str(&self.lexer_dfa_state_string(edge.from));
487            out.push('-');
488            out.push_str(&label);
489            out.push_str("->");
490            out.push_str(&self.lexer_dfa_state_string(edge.to));
491            out.push('\n');
492        }
493        out
494    }
495
496    fn lexer_dfa_state_string(&self, state: usize) -> String {
497        self.lexer_dfa.accept_predictions.get(&state).map_or_else(
498            || format!("s{state}"),
499            |prediction| format!(":s{state}=>{prediction}"),
500        )
501    }
502}
503
/// Renders an edge symbol as a single-quoted character, e.g. `'a'`.
///
/// Returns `None` when `symbol` is not a Unicode scalar value: negative
/// numbers, surrogates, and anything above `0x10FFFF`.
fn lexer_dfa_edge_label(symbol: i32) -> Option<String> {
    let code_point = u32::try_from(symbol).ok()?;
    let ch = char::from_u32(code_point)?;
    Some(format!("'{ch}'"))
}