lexigram_core/
fixed_sym_table.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3use crate::parser::Symbol;
4use crate::{TokenId, VarId};
5
/// Stores the names of the terminal and nonterminal symbols used by a parser.
///
/// Terminals are defined in the lexicon. They have two parts to their name:
/// - the identifier in the lexicon
/// - the source string they represent (optional)
///
/// For example:
/// ```lexicon
/// Plus : '+';
/// ...
/// ID    : [a-zA-Z][a-zA-Z_0-9]*;
/// ```
///
/// If `Plus`'s token ID is 0 and `ID`'s is 24,
/// ```ignore
/// t[0] = ("Plus".to_string(), Some("+".to_string()));
/// t[24] = ("ID".to_string(), None);
/// ```
///
/// Nonterminals are defined in the grammar, and possibly completed by new ones when
/// the rules are adapted to the target parser. For example, recursive rules are
/// transformed for LL(1) parsers, which usually adds extra rules.
///
/// ```grammar
/// expr: expr Plus term | term;
/// ```
/// If `expr` is 0 and `term` is 1,
/// ```ignore
/// nt[0] = "expr".to_string();
/// nt[1] = "term".to_string();
/// nt[2] = "expr_1".to_string(); // generated when removing the left recursion
/// ```
#[derive(Clone, Debug)]
pub struct FixedSymTable {
    t: Vec<(String, Option<String>)>,   // indexed by token ID: (terminal identifier, optional string representation)
    nt: Vec<String>,                    // indexed by nonterminal ID: nonterminal identifier
}
43
44impl FixedSymTable {
45    pub fn new(t: Vec<(String, Option<String>)>, nt: Vec<String>) -> Self {
46        FixedSymTable { t, nt }
47    }
48
49    // -------------------------------------------------------------------------
50
51    pub fn get_terminals(&self) -> impl Iterator<Item = &(String, Option<String>)> {
52        self.t.iter()
53    }
54
55    pub fn get_num_t(&self) -> usize {
56        self.t.len()
57    }
58
59    // -------------------------------------------------------------------------
60
61    pub fn get_nonterminals(&self) -> impl Iterator<Item = &String> {
62        self.nt.iter()
63    }
64
65    pub fn get_num_nt(&self) -> usize {
66        self.nt.len()
67    }
68
69    // -------------------------------------------------------------------------
70
71    #[cfg(test)]
72    pub fn dump(&self, title: &str) {
73        if !title.is_empty() {
74            println!("{title}");
75        }
76        println!(
77            "- nonterminals:\n{}",
78            self.get_nonterminals().enumerate().map(|(v, s)| format!("  - NT[{v}]: {s}")).collect::<Vec<_>>().join("\n"));
79        println!(
80            "- terminals:\n{}",
81            self.get_terminals().enumerate()
82                .map(|(t, (n, v_maybe))| format!("  - T[{t}]: {n}{}", if let Some(v) = v_maybe { format!(" = {v:?}") } else { String::new() }))
83                .collect::<Vec<_>>().join("\n"));
84    }
85}
86
/// Queries about the terminal and nonterminal symbols of a parser: whether a terminal
/// holds data, and which names or strings represent the symbols.
pub trait SymInfoTable {
    /// Does `Symbol::T(token)` hold lexer string data?
    ///
    /// Terminals are divided into two categories: fixed and variable content. When the
    /// terminal is defined with choices and ranges of characters, like `ID: [a-z]+`, it
    /// contains variable content: data like the ID specifier.
    fn is_token_data(&self, token: TokenId) -> bool;

    /// Is `symbol` a terminal holding lexer string data?
    ///
    /// Terminals are divided into two categories: fixed and variable content. When the
    /// terminal is defined with choices and ranges of characters, like `ID: [a-z]+`, it
    /// contains variable content: data like the ID specifier.
    fn is_symbol_t_data(&self, symbol: &Symbol) -> bool;

    /// Is `symbol` a terminal with fixed content?
    ///
    /// This is the counterpart of [`is_symbol_t_data`](SymInfoTable::is_symbol_t_data):
    /// a fixed terminal always represents the same source string, like `Plus: '+'`.
    fn is_symbol_t_fixed(&self, symbol: &Symbol) -> bool;

    /// Gets the terminal's representation string: its string value (if it exists),
    /// or its identifier otherwise.
    fn get_t_str(&self, token: TokenId) -> String;

    /// Gets the terminal's identifier.
    fn get_t_name(&self, token: TokenId) -> String;

    /// Gets the nonterminal's identifier.
    fn get_nt_name(&self, var: VarId) -> String;

    /// Gets the symbol's name: the nonterminal identifier, the terminal identifier,
    /// or "ε", "$", ...
    fn get_name(&self, symbol: &Symbol) -> String;

    /// Gets the symbol's representation string: the nonterminal identifier, the
    /// terminal string value (if it exists), or "ε", "$", ...
    fn get_str(&self, symbol: &Symbol) -> String;

    /// Gets the symbol's representation string, with the string value of a
    /// fixed-content terminal put between quotes.
    ///
    /// NOTE(review): described from the `FixedSymTable` implementation, which formats
    /// the string value with `{:?}`; other implementors should be checked against it.
    fn get_name_quote(&self, symbol: &Symbol) -> String;
}
120
121impl SymInfoTable for FixedSymTable {
122    fn is_token_data(&self, token: TokenId) -> bool {
123        self.t[token as usize].1.is_none()
124    }
125
126    fn is_symbol_t_data(&self, symbol: &Symbol) -> bool {
127        if let Symbol::T(token) = symbol {
128            self.t.get(*token as usize).map(|t| t.1.is_none()).unwrap_or(false)
129        } else {
130            false
131        }
132    }
133
134    fn is_symbol_t_fixed(&self, symbol: &Symbol) -> bool {
135        if let Symbol::T(token) = symbol {
136            self.t.get(*token as usize).map(|t| t.1.is_some()).unwrap_or(false)
137        } else {
138            false
139        }
140    }
141
142    fn get_t_str(&self, token: TokenId) -> String {
143        match token {
144            _ if (token as usize) < self.t.len() => {
145                let (name, literal) = &self.t[token as usize];
146                literal.as_ref().unwrap_or(name).clone()
147            }
148            TokenId::MAX => "<bad character>".to_string(),
149            _ => format!("T({token}?)")
150        }
151    }
152
153    fn get_t_name(&self, token: TokenId) -> String {
154        if token as usize >= self.t.len() {
155            format!("T({token}?)")
156        } else {
157            self.t[token as usize].0.clone()
158        }
159    }
160
161    fn get_nt_name(&self, var: VarId) -> String {
162        if var as usize >= self.nt.len() { return format!("NT({var}?)") }
163        self.nt[var as usize].clone()
164    }
165
166    fn get_name(&self, symbol: &Symbol) -> String {
167        match symbol {
168            Symbol::Empty | Symbol::End => symbol.to_string(),
169            Symbol::T(token) => self.get_t_name(*token),
170            Symbol::NT(var) => self.get_nt_name(*var),
171        }
172    }
173
174    fn get_str(&self, symbol: &Symbol) -> String {
175        match symbol {
176            Symbol::Empty | Symbol::End => symbol.to_string(),
177            Symbol::T(token) => self.get_t_str(*token),
178            Symbol::NT(var) => self.get_nt_name(*var),
179        }
180    }
181
182    fn get_name_quote(&self, symbol: &Symbol) -> String {
183        match symbol {
184            Symbol::Empty | Symbol::End => symbol.to_string(),
185            Symbol::T(token) => if self.is_symbol_t_fixed(symbol) { format!("{:?}", self.get_t_str(*token)) } else { self.get_t_str(*token) },
186            Symbol::NT(var) => self.get_nt_name(*var),
187        }
188    }
189}