Skip to main content

antlr4_runtime/atn/
mod.rs

1//! Abstract Transition Network structures shared by generated lexers and
2//! parsers.
3//!
4//! ANTLR serializes grammars into an ATN. Generated Rust code stores that
5//! serialized data in static metadata, while the runtime deserializes it into
6//! these compact Rust structures for simulation.
7
8pub mod lexer;
9pub mod serialized;
10
11/// Distinguishes lexer ATNs from parser ATNs in serialized grammar metadata.
12#[derive(Clone, Copy, Debug, Eq, PartialEq)]
13pub enum AtnType {
14    Lexer,
15    Parser,
16}
17
18/// Deserialized ANTLR Abstract Transition Network.
19///
20/// The structure keeps the state graph plus ANTLR side tables such as
21/// rule-to-start, rule-to-token, mode-to-start, decisions, and lexer actions.
22/// The side tables are part of the runtime contract because generated grammars
23/// should only need to provide metadata; simulation stays in this crate.
24#[derive(Clone, Debug, Eq, PartialEq)]
25pub struct Atn {
26    grammar_type: AtnType,
27    max_token_type: i32,
28    states: Vec<AtnState>,
29    rule_to_start_state: Vec<usize>,
30    rule_to_stop_state: Vec<usize>,
31    rule_to_token_type: Vec<i32>,
32    mode_to_start_state: Vec<usize>,
33    decision_to_state: Vec<usize>,
34    lexer_actions: Vec<LexerAction>,
35}
36
37impl Atn {
38    /// Creates an empty ATN with the grammar kind and maximum token type read
39    /// from the serialized header.
40    pub const fn new(grammar_type: AtnType, max_token_type: i32) -> Self {
41        Self {
42            grammar_type,
43            max_token_type,
44            states: Vec::new(),
45            rule_to_start_state: Vec::new(),
46            rule_to_stop_state: Vec::new(),
47            rule_to_token_type: Vec::new(),
48            mode_to_start_state: Vec::new(),
49            decision_to_state: Vec::new(),
50            lexer_actions: Vec::new(),
51        }
52    }
53
54    pub const fn grammar_type(&self) -> AtnType {
55        self.grammar_type
56    }
57
58    pub const fn max_token_type(&self) -> i32 {
59        self.max_token_type
60    }
61
62    pub fn states(&self) -> &[AtnState] {
63        &self.states
64    }
65
66    pub fn state(&self, state_number: usize) -> Option<&AtnState> {
67        self.states.get(state_number)
68    }
69
70    pub fn state_mut(&mut self, state_number: usize) -> Option<&mut AtnState> {
71        self.states.get_mut(state_number)
72    }
73
74    /// Appends a state and returns the state number assigned by insertion
75    /// order.
76    pub fn add_state(&mut self, state: AtnState) -> usize {
77        let index = self.states.len();
78        self.states.push(state);
79        index
80    }
81
82    pub fn decision_to_state(&self) -> &[usize] {
83        &self.decision_to_state
84    }
85
86    pub fn add_decision_state(&mut self, state_number: usize) {
87        self.decision_to_state.push(state_number);
88    }
89
90    pub fn rule_to_start_state(&self) -> &[usize] {
91        &self.rule_to_start_state
92    }
93
94    pub fn set_rule_to_start_state(&mut self, rule_to_start_state: Vec<usize>) {
95        self.rule_to_start_state = rule_to_start_state;
96    }
97
98    pub fn rule_to_stop_state(&self) -> &[usize] {
99        &self.rule_to_stop_state
100    }
101
102    pub fn set_rule_to_stop_state(&mut self, rule_to_stop_state: Vec<usize>) {
103        self.rule_to_stop_state = rule_to_stop_state;
104    }
105
106    pub fn rule_to_token_type(&self) -> &[i32] {
107        &self.rule_to_token_type
108    }
109
110    pub fn set_rule_to_token_type(&mut self, rule_to_token_type: Vec<i32>) {
111        self.rule_to_token_type = rule_to_token_type;
112    }
113
114    pub fn mode_to_start_state(&self) -> &[usize] {
115        &self.mode_to_start_state
116    }
117
118    pub fn add_mode_start_state(&mut self, state_number: usize) {
119        self.mode_to_start_state.push(state_number);
120    }
121
122    pub fn lexer_actions(&self) -> &[LexerAction] {
123        &self.lexer_actions
124    }
125
126    pub fn set_lexer_actions(&mut self, lexer_actions: Vec<LexerAction>) {
127        self.lexer_actions = lexer_actions;
128    }
129}
130
131/// A node in the ANTLR ATN graph.
132///
133/// Some ANTLR state subclasses carry references to paired states, such as a
134/// block-start state's end state or a loop-end state's loop-back state. This
135/// representation stores those links as state numbers so the graph remains easy
136/// to clone and serialize in tests.
137#[derive(Clone, Debug, Eq, PartialEq)]
138pub struct AtnState {
139    pub state_number: usize,
140    pub rule_index: Option<usize>,
141    pub kind: AtnStateKind,
142    pub end_state: Option<usize>,
143    pub loop_back_state: Option<usize>,
144    pub non_greedy: bool,
145    pub precedence_rule_decision: bool,
146    pub left_recursive_rule: bool,
147    pub transitions: Vec<Transition>,
148}
149
150impl AtnState {
151    /// Creates an ATN state with no rule index and no outgoing transitions.
152    pub const fn new(state_number: usize, kind: AtnStateKind) -> Self {
153        Self {
154            state_number,
155            rule_index: None,
156            kind,
157            end_state: None,
158            loop_back_state: None,
159            non_greedy: false,
160            precedence_rule_decision: false,
161            left_recursive_rule: false,
162            transitions: Vec::new(),
163        }
164    }
165
166    #[must_use]
167    pub const fn with_rule_index(mut self, rule_index: usize) -> Self {
168        self.rule_index = Some(rule_index);
169        self
170    }
171
172    /// Adds an outgoing transition in serialized order.
173    ///
174    /// Transition order matters for alternatives and lexer priority, so the
175    /// runtime preserves the order emitted by ANTLR.
176    pub fn add_transition(&mut self, transition: Transition) {
177        self.transitions.push(transition);
178    }
179
180    pub fn is_rule_stop(&self) -> bool {
181        self.kind == AtnStateKind::RuleStop
182    }
183}
184
185/// Serialized ANTLR state kind.
186#[derive(Clone, Copy, Debug, Eq, PartialEq)]
187pub enum AtnStateKind {
188    Invalid,
189    Basic,
190    RuleStart,
191    BlockStart,
192    PlusBlockStart,
193    StarBlockStart,
194    TokenStart,
195    RuleStop,
196    BlockEnd,
197    StarLoopBack,
198    StarLoopEntry,
199    PlusLoopBack,
200    LoopEnd,
201}
202
203/// Edge between two ATN states.
204///
205/// Epsilon-like transitions do not consume input. Matching transitions compare
206/// the current input symbol against an atom, range, set, negated set, or
207/// wildcard.
208#[derive(Clone, Debug, Eq, PartialEq)]
209pub enum Transition {
210    Epsilon {
211        target: usize,
212    },
213    Atom {
214        target: usize,
215        label: i32,
216    },
217    Range {
218        target: usize,
219        start: i32,
220        stop: i32,
221    },
222    Set {
223        target: usize,
224        set: IntervalSet,
225    },
226    NotSet {
227        target: usize,
228        set: IntervalSet,
229    },
230    Wildcard {
231        target: usize,
232    },
233    Rule {
234        target: usize,
235        rule_index: usize,
236        follow_state: usize,
237        precedence: i32,
238    },
239    Predicate {
240        target: usize,
241        rule_index: usize,
242        pred_index: usize,
243        context_dependent: bool,
244    },
245    Action {
246        target: usize,
247        rule_index: usize,
248        action_index: Option<usize>,
249        context_dependent: bool,
250    },
251    Precedence {
252        target: usize,
253        precedence: i32,
254    },
255}
256
257impl Transition {
258    /// Returns the target state number for this transition.
259    pub const fn target(&self) -> usize {
260        match self {
261            Self::Epsilon { target }
262            | Self::Atom { target, .. }
263            | Self::Range { target, .. }
264            | Self::Set { target, .. }
265            | Self::NotSet { target, .. }
266            | Self::Wildcard { target }
267            | Self::Rule { target, .. }
268            | Self::Predicate { target, .. }
269            | Self::Action { target, .. }
270            | Self::Precedence { target, .. } => *target,
271        }
272    }
273
274    /// Returns whether traversing this transition consumes no input.
275    pub const fn is_epsilon(&self) -> bool {
276        matches!(
277            self,
278            Self::Epsilon { .. }
279                | Self::Rule { .. }
280                | Self::Predicate { .. }
281                | Self::Action { .. }
282                | Self::Precedence { .. }
283        )
284    }
285
286    /// Tests whether this transition consumes `symbol`.
287    ///
288    /// `min_vocabulary` and `max_vocabulary` define the accepted symbol range
289    /// for wildcard and negated-set transitions.
290    pub fn matches(&self, symbol: i32, min_vocabulary: i32, max_vocabulary: i32) -> bool {
291        match self {
292            Self::Atom { label, .. } => *label == symbol,
293            Self::Range { start, stop, .. } => (*start..=*stop).contains(&symbol),
294            Self::Set { set, .. } => set.contains(symbol),
295            Self::NotSet { set, .. } => {
296                (min_vocabulary..=max_vocabulary).contains(&symbol) && !set.contains(symbol)
297            }
298            Self::Wildcard { .. } => (min_vocabulary..=max_vocabulary).contains(&symbol),
299            Self::Epsilon { .. }
300            | Self::Rule { .. }
301            | Self::Predicate { .. }
302            | Self::Action { .. }
303            | Self::Precedence { .. } => false,
304        }
305    }
306}
307
308/// Ordered set of integer intervals used by set and negated-set transitions.
309///
310/// Unicode grammars can contain very large ranges, so this stores normalized
311/// intervals rather than expanding every code point into a flat set.
312#[derive(Clone, Debug, Default, Eq, PartialEq)]
313pub struct IntervalSet {
314    ranges: Vec<(i32, i32)>,
315}
316
317impl IntervalSet {
318    pub fn new() -> Self {
319        Self::default()
320    }
321
322    pub fn from_range(start: i32, stop: i32) -> Self {
323        let mut set = Self::new();
324        set.add_range(start, stop);
325        set
326    }
327
328    pub fn add(&mut self, value: i32) {
329        self.add_range(value, value);
330    }
331
332    /// Adds an inclusive interval and merges it with adjacent or overlapping
333    /// intervals.
334    pub fn add_range(&mut self, start: i32, stop: i32) {
335        let (start, stop) = if start <= stop {
336            (start, stop)
337        } else {
338            (stop, start)
339        };
340        self.ranges.push((start, stop));
341        self.normalize();
342    }
343
344    /// Re-sorts and coalesces interval storage after insertion.
345    fn normalize(&mut self) {
346        self.ranges.sort_unstable();
347        let mut merged: Vec<(i32, i32)> = Vec::with_capacity(self.ranges.len());
348        for (start, stop) in self.ranges.drain(..) {
349            if let Some((_, last_stop)) = merged.last_mut() {
350                if start <= last_stop.saturating_add(1) {
351                    *last_stop = (*last_stop).max(stop);
352                    continue;
353                }
354            }
355            merged.push((start, stop));
356        }
357        self.ranges = merged;
358    }
359
360    /// Returns true when `value` falls inside any stored interval.
361    pub fn contains(&self, value: i32) -> bool {
362        self.ranges
363            .iter()
364            .any(|(start, stop)| (*start..=*stop).contains(&value))
365    }
366
367    pub fn ranges(&self) -> &[(i32, i32)] {
368        &self.ranges
369    }
370
371    pub const fn is_empty(&self) -> bool {
372        self.ranges.is_empty()
373    }
374}
375
376/// Serialized lexer action attached to an action transition.
377///
378/// These actions are grammar-independent operations generated by ANTLR's lexer
379/// commands (`skip`, `more`, `type`, `channel`, `pushMode`, `popMode`, and
380/// `mode`). Custom embedded actions are represented but intentionally inert
381/// until a generated semantic-action hook exists.
382#[derive(Clone, Debug, Eq, PartialEq)]
383pub enum LexerAction {
384    Channel(i32),
385    Custom { rule_index: i32, action_index: i32 },
386    Mode(i32),
387    More,
388    PopMode,
389    PushMode(i32),
390    Skip,
391    Type(i32),
392}
393
394#[cfg(test)]
395mod tests {
396    use super::*;
397
398    #[test]
399    fn interval_set_handles_ranges() {
400        let set = IntervalSet::from_range(2, 4);
401        assert!(set.contains(2));
402        assert!(set.contains(3));
403        assert!(set.contains(4));
404        assert!(!set.contains(5));
405        assert_eq!(set.ranges(), &[(2, 4)]);
406    }
407}