Skip to main content

lexigram_lib/
symbol_table.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3#[cfg(any())]
4use std::collections::HashMap;
5use crate::{NameFixer, indent_source};
6use crate::fixed_sym_table::{FixedSymTable, SymInfoTable};
7use crate::{TokenId, VarId};
8use crate::parser::Symbol;
9use lexigram_core::CollectJoin;
10
11// NOTE: nonterminal-to-ID functionality currently disabled by #[cfg(any())]
12
13/// Stores the names of the terminal and nonterminal symbols when building a parser.
14///
15/// Terminals are defined in the lexicon and don't change. They have two parts to their name:
16/// - the identifier in the lexicon
17/// - the value, which is the source string they represent (optional)
18///
19/// For example:
20/// ```lexicon
21/// Plus : '+';
22/// ...
23/// ID    : [a-zA-Z][a-zA-Z_0-9]*;
24/// ```
25///
26/// If `Arrow`'s token ID is 0 and `ID`'s is 24,
27/// ```ignore
28/// t[0] = ("Plus".to_string(), Some("+".to_string()));
29/// t[24] = ("ID".to_string(), None);
30/// ```
31///
32/// They're added to the symbol table with [`add_terminal()`](SymbolTable::add_terminal).
33///
34/// Nonterminals are defined in the grammar, and possibly completed by new ones when
35/// the rules are adapted to the target parser. For example, recursive rules are
36/// transformed for LL(1) parsers, which usually adds extra rules.
37///
38/// ```grammar
39/// expr: expr Plus term | term;
40/// ```
41/// If `expr` is 0 and `term` is 1,
42/// ```ignore
43/// nt[0] = "expr".to_string();
44/// nt[1] = "term".to_string();
45/// ```
46/// They're added with [`add_nonterminal`](SymbolTable::add_nonterminal).
47///
48/// The new rules are called "children" of the transformed rules, and can be added
49/// with [`add_child_nonterminal()`](SymbolTable::add_child_nonterminal). The name
50/// is the parent's name followed by "`_<number>`". For example, adding a child to
51/// nonterminal 0 creates
52///
53/// ```ignore
54/// nt[2] = "expr_1".to_string()
55/// ```
56///
57#[derive(Clone, Debug)]
58pub struct SymbolTable {
59    t: Vec<(String, Option<String>)>,   // terminal identifiers and optional representation
60    fixer_t: NameFixer,                 // keeps terminal identifiers unique
61    nt: Vec<String>,                    // nt to nonterminal identifier
62    #[cfg(any())]
63    names: HashMap<String, VarId>,      // nonterminal identifier to nt
64    fixer_nt: NameFixer,                // keeps nonterminal identifiers unique
65}
66
67impl SymbolTable {
68    pub fn new() -> Self {
69        SymbolTable {
70            t: Vec::new(),
71            fixer_t: NameFixer::new_empty(),
72            nt: Vec::new(),
73            #[cfg(any())]
74            names: HashMap::new(),
75            fixer_nt: NameFixer::new_empty(),
76        }
77    }
78
79    pub fn to_fixed_sym_table(self) -> FixedSymTable {
80        FixedSymTable::new(self.t, self.nt)
81    }
82
83    // -------------------------------------------------------------------------
84
85    pub fn add_terminal<T: Into<String>>(&mut self, name: T, value_maybe: Option<T>) -> TokenId {
86        let token = self.t.len();
87        assert!(token < TokenId::MAX as usize);
88        let unique_name = self.fixer_t.get_unique_name(name.into());
89        self.t.push((unique_name, value_maybe.map(|n| n.into())));
90        token as TokenId
91    }
92
93    pub fn extend_terminals<I: IntoIterator<Item=(T, Option<T>)>, T: Into<String>>(&mut self, terminals: I) {
94        for (s, maybe) in terminals {
95            self.add_terminal(s, maybe);
96        }
97    }
98
99    pub fn get_terminals(&self) -> impl Iterator<Item = &(String, Option<String>)> {
100        self.t.iter()
101    }
102
103    pub fn get_num_t(&self) -> usize {
104        self.t.len()
105    }
106
107    pub fn set_t_value(&mut self, token: TokenId, name_maybe: Option<String>) {
108        self.t[token as usize].1 = name_maybe
109    }
110
111    pub fn downsize_num_t(&mut self, num_t: usize) {
112        if num_t < self.t.len() {
113            for (name, _) in &mut self.t[num_t..] {
114                self.fixer_t.remove(name);
115            }
116            self.t.resize(num_t, (String::new(), None));
117        }
118    }
119
120    // -------------------------------------------------------------------------
121
122    fn add_nt(&mut self, unique_name: String) -> VarId {
123        let var = self.nt.len();
124        assert!(var < VarId::MAX as usize);
125        #[cfg(any())]
126        self.names.insert(unique_name.clone(), var as VarId);
127        self.nt.push(unique_name);
128        var as VarId
129    }
130
131    pub fn add_nonterminal<T: Into<String>>(&mut self, name: T) -> VarId {
132        let unique_name = self.fixer_nt.get_unique_name(name.into());
133        self.add_nt(unique_name)
134    }
135
136    pub fn add_child_nonterminal(&mut self, var: VarId) -> VarId {
137        let unique_name = self.fixer_nt.get_unique_name_unum(self.nt[var as usize].clone());
138        self.add_nt(unique_name)
139    }
140    
141    pub fn extend_nonterminals<I: IntoIterator<Item=T>, T: Into<String>>(&mut self, nonterminals: I) {
142        for s in nonterminals {
143            self.add_nonterminal(s);
144        }
145    }
146
147    #[cfg(any())]
148    pub fn find_nonterminal(&self, name: &str) -> Option<VarId> {
149        self.names.get(name).cloned()
150    }
151
152    pub fn get_nonterminals(&self) -> impl Iterator<Item = &String> {
153        self.nt.iter()
154    }
155
156    pub fn get_num_nt(&self) -> usize {
157        self.nt.len()
158    }
159
160    pub fn remove_nonterminal(&mut self, v: VarId) {
161        let name = self.nt.remove(v as usize);
162        #[cfg(any())]
163        {
164            for old_v in self.names.values_mut() {
165                if *old_v >= v { *old_v -= 1; }
166            }
167            self.names.remove(&name);
168        }
169        self.fixer_nt.remove(&name);
170    }
171
172    pub fn set_nt_name(&mut self, var: VarId, name: String) {
173        self.nt[var as usize] = name;
174    }
175
176    pub fn find_nt(&self, name: &str) -> Option<VarId> {
177        self.nt.iter().position(|s| s == name).map(|n| n as VarId)
178    }
179
180    /// Removes the name assigned to NT `var` and returns it. Internally, the name of the NT is
181    /// replaced by another unique string. The NT is expected to be removed later.
182    pub fn remove_nt_name(&mut self, var: VarId) -> String {
183        let mut removed = self.fixer_nt.get_unique_name_num(format!("{var}_removed"));
184        std::mem::swap(&mut self.nt[var as usize], &mut removed);
185        #[cfg(any())]
186        self.names.remove(&removed);
187        self.fixer_nt.remove(&removed);
188        removed
189    }
190
191    pub fn gen_source_code_t(&self, indent: usize, has_enum: bool, has_array: bool) -> String {
192        let mut source = Vec::<String>::new();
193        let (max_n, max_option) = self.get_terminals().fold((0, 0), |(n, option), t| {
194            (n.max(t.0.len()), option.max(t.1.as_ref().map_or(0, |o| o.len())))
195        });
196        if has_enum {
197            // Arrow = 0,  // 0
198            // Colon,      // 1
199            source.push("#[repr(u16)]".to_string());
200            source.push("enum T {".to_string());
201            for (i, (n, _option)) in self.get_terminals().enumerate() {
202                source.push(format!("    {n:max_n$}{} // {i}", if i == 0 { " = 0," } else { ",    " }));
203            }
204            source.push("}".to_string());
205        }
206        if has_enum && has_array {
207            source.push(String::new());
208        }
209        if has_array {
210            // ("Arrow",     Some("->")),          // 0
211            // ("Colon",     Some(":")),           // 1
212            source.push(format!("static TERMINALS: [(&str, Option<&str>); {}] = [", self.get_num_t()));
213            for (i, (n, option)) in self.get_terminals().enumerate() {
214                source.push(format!("    (\"{n}\",{:w1$}{}),{:w2$} // {i}",
215                                    "",
216                                    if let Some(o) = option { format!("Some(\"{o}\")") } else { "None".to_string() },
217                                    "",
218                                    w1 = max_n - n.len(), w2 = max_option + 4 - option.as_ref().map_or(0, |o| o.len() + 4)));
219            }
220            source.push("];".to_string());
221        }
222        indent_source(vec![source], indent)
223    }
224
225    // -------------------------------------------------------------------------
226
227    pub fn dump_str(&self) -> String {
228        let mut str = format!(
229            "  - nonterminals:\n{}\n",
230            self.get_nonterminals().enumerate().map(|(v, s)| format!("    - NT[{v}]: {s}")).join("\n"));
231        str.push_str(&format!(
232            "  - terminals:\n{}",
233            self.get_terminals().enumerate()
234                .map(|(t, (n, v_maybe))| format!("    - T[{t}]: {n}{}", if let Some(v) = v_maybe { format!(" = {v:?}") } else { String::new() }))
235                .join("\n")));
236        str
237    }
238
239    pub fn dump(&self, title: &str) {
240        if !title.is_empty() {
241            println!("{title}");
242        }
243        println!("{}", self.dump_str());
244    }
245}
246
247impl SymInfoTable for SymbolTable {
248    fn is_token_data(&self, token: TokenId) -> bool {
249        self.t.get(token as usize).map(|t| t.1.is_none()).unwrap_or(false)
250    }
251
252    fn is_symbol_t_data(&self, symbol: &Symbol) -> bool {
253        if let Symbol::T(token) = symbol {
254            self.t.get(*token as usize).map(|t| t.1.is_none()).unwrap_or(false)
255        } else {
256            false
257        }
258    }
259
260    fn is_symbol_t_fixed(&self, symbol: &Symbol) -> bool {
261        if let Symbol::T(token) = symbol {
262            if (*token as usize) < self.t.len() {
263                self.t.get(*token as usize).map(|t| t.1.is_some()).unwrap_or(false)
264            } else {
265                false
266            }
267        } else {
268            false
269        }
270    }
271
272    fn get_t_str(&self, token: TokenId) -> String {
273        match token {
274            _ if (token as usize) < self.t.len() => {
275                let (name, literal) = &self.t[token as usize];
276                literal.as_ref().unwrap_or(name).clone()
277            }
278            TokenId::MAX => "<bad character>".to_string(),
279            _ => format!("T({token}?)")
280        }
281    }
282
283    fn get_t_name(&self, token: TokenId) -> String {
284        if token as usize >= self.t.len() {
285            format!("T({token}?)")
286        } else {
287            self.t[token as usize].0.clone()
288        }
289    }
290
291    fn get_nt_name(&self, var: VarId) -> String {
292        if var as usize >= self.nt.len() { return format!("NT({var}?)") }
293        self.nt[var as usize].clone()
294    }
295
296    fn get_name(&self, symbol: &Symbol) -> String {
297        match symbol {
298            Symbol::Empty | Symbol::End => symbol.to_string(),
299            Symbol::T(token) => self.get_t_name(*token),
300            Symbol::NT(var) => self.get_nt_name(*var),
301        }
302    }
303
304    fn get_str(&self, symbol: &Symbol) -> String {
305        match symbol {
306            Symbol::Empty | Symbol::End => symbol.to_string(),
307            Symbol::T(token) => self.get_t_str(*token),
308            Symbol::NT(var) => self.get_nt_name(*var),
309        }
310    }
311
312    fn get_name_quote(&self, symbol: &Symbol) -> String {
313        match symbol {
314            Symbol::Empty | Symbol::End => symbol.to_string(),
315            Symbol::T(token) => if self.is_symbol_t_fixed(symbol) { format!("{:?}", self.get_t_str(*token)) } else { self.get_t_str(*token) },
316            Symbol::NT(var) => self.get_nt_name(*var),
317        }
318    }
319}
320
321impl Default for SymbolTable {
322    fn default() -> Self {
323        SymbolTable::new()
324    }
325}
326
327#[cfg(test)]
328mod tests {
329    use super::*;
330
331    #[test]
332    fn general() {
333        let mut st = SymbolTable::new();
334        st.extend_nonterminals(["A".to_string(), "NOT_USED".to_string(), "E".to_string()]);
335        assert_eq!(st.add_child_nonterminal(0), 3);
336        assert_eq!(st.get_str(&Symbol::NT(3)), "A_1");
337        assert_eq!(st.add_child_nonterminal(0), 4);
338        assert_eq!(st.get_str(&Symbol::NT(4)), "A_2");
339        assert_eq!(st.add_child_nonterminal(2), 5);
340        assert_eq!(st.get_str(&Symbol::NT(5)), "E_1");
341        assert_eq!(st.add_child_nonterminal(1), 6);
342        st.remove_nonterminal(1);
343        assert_eq!(st.get_str(&Symbol::NT(2)), "A_1");
344        assert_eq!(st.get_str(&Symbol::NT(3)), "A_2");
345        assert_eq!(st.get_str(&Symbol::NT(4)), "E_1");
346        #[cfg(any())]
347        assert_eq!(st.find_nonterminal("A_1"), Some(2));
348        assert!(st.nt.contains(&"A_2".to_string()));
349        #[cfg(any())]
350        assert!(st.names.contains_key("A_2"));
351        assert!(st.fixer_nt.contains("A_2"));
352        st.remove_nt_name(3);
353        assert!(!st.nt.contains(&"A_2".to_string()));
354        #[cfg(any())]
355        assert!(!st.names.contains_key("A_2"));
356        assert!(!st.fixer_nt.contains("A_2"));
357    }
358
359    #[test]
360    fn terminals() {
361        let mut st = SymbolTable::new();
362        let tnames = vec![("a", Some("aa")), ("b", Some("bb")), ("a", Some("A")), ("c", None), ("d", None)];
363        st.extend_terminals(tnames);
364        assert_eq!(st.get_num_t(), 5);
365        let result = st.get_terminals().map(|(s, v)| (s.as_str(), v.as_ref().map(|s| s.as_str()))).to_vec();
366        assert_eq!(result, vec![("a", Some("aa")), ("b", Some("bb")), ("a1", Some("A")), ("c", None), ("d", None)]);
367        for name in vec!["a1", "c", "d"] {
368            assert_eq!(st.fixer_t.contains(name), true);
369        }
370        st.downsize_num_t(2);
371        let result = st.get_terminals().map(|(s, v)| (s.as_str(), v.as_ref().map(|s| s.as_str()))).to_vec();
372        assert_eq!(result, vec![("a", Some("aa")), ("b", Some("bb"))]);
373        assert_eq!(st.get_num_t(), 2);
374        for name in vec!["a1", "c", "d"] {
375            assert_eq!(st.fixer_t.contains(name), false);
376        }
377        let extra_tnames = vec![("a", Some("A")), ("c", None), ("d", None)];
378        st.extend_terminals(extra_tnames);
379        let result = st.get_terminals().map(|(s, v)| (s.as_str(), v.as_ref().map(|s| s.as_str()))).to_vec();
380        assert_eq!(result, vec![("a", Some("aa")), ("b", Some("bb")), ("a1", Some("A")), ("c", None), ("d", None)]);
381    }
382}