Skip to main content

lexigram_lib/
symbol_table.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3#[cfg(any())]
4use std::collections::HashMap;
5use crate::{NameFixer, indent_source};
6use crate::fixed_sym_table::{FixedSymTable, SymInfoTable};
7use crate::{TokenId, VarId};
8use crate::parser::Symbol;
9#[cfg(test)]
10use lexigram_core::CollectJoin;
11
12// NOTE: nonterminal-to-ID functionality currently disabled by #[cfg(any())]
13
14/// Stores the names of the terminal and nonterminal symbols when building a parser.
15///
16/// Terminals are defined in the lexicon and don't change. They have two parts to their name:
17/// - the identifier in the lexicon
18/// - the value, which is the source string they represent (optional)
19///
20/// For example:
21/// ```lexicon
22/// Plus : '+';
23/// ...
24/// ID    : [a-zA-Z][a-zA-Z_0-9]*;
25/// ```
26///
27/// If `Arrow`'s token ID is 0 and `ID`'s is 24,
28/// ```ignore
29/// t[0] = ("Plus".to_string(), Some("+".to_string()));
30/// t[24] = ("ID".to_string(), None);
31/// ```
32///
33/// They're added to the symbol table with [`add_terminal()`](SymbolTable::add_terminal).
34///
35/// Nonterminals are defined in the grammar, and possibly completed by new ones when
36/// the rules are adapted to the target parser. For example, recursive rules are
37/// transformed for LL(1) parsers, which usually adds extra rules.
38///
39/// ```grammar
40/// expr: expr Plus term | term;
41/// ```
42/// If `expr` is 0 and `term` is 1,
43/// ```ignore
44/// nt[0] = "expr".to_string();
45/// nt[1] = "term".to_string();
46/// ```
47/// They're added with [`add_nonterminal`](SymbolTable::add_nonterminal).
48///
49/// The new rules are called "children" of the transformed rules, and can be added
50/// with [`add_child_nonterminal()`](SymbolTable::add_child_nonterminal). The name
51/// is the parent's name followed by "`_<number>`". For example, adding a child to
52/// nonterminal 0 creates
53///
54/// ```ignore
55/// nt[2] = "expr_1".to_string()
56/// ```
57///
58#[derive(Clone, Debug)]
59pub struct SymbolTable {
60    t: Vec<(String, Option<String>)>,   // terminal identifiers and optional representation
61    fixer_t: NameFixer,                 // keeps terminal identifiers unique
62    nt: Vec<String>,                    // nt to nonterminal identifier
63    #[cfg(any())]
64    names: HashMap<String, VarId>,      // nonterminal identifier to nt
65    fixer_nt: NameFixer,                // keeps nonterminal identifiers unique
66}
67
68impl SymbolTable {
69    pub fn new() -> Self {
70        SymbolTable {
71            t: Vec::new(),
72            fixer_t: NameFixer::new_empty(),
73            nt: Vec::new(),
74            #[cfg(any())]
75            names: HashMap::new(),
76            fixer_nt: NameFixer::new_empty(),
77        }
78    }
79
80    pub fn to_fixed_sym_table(self) -> FixedSymTable {
81        FixedSymTable::new(self.t, self.nt)
82    }
83
84    // -------------------------------------------------------------------------
85
86    pub fn add_terminal<T: Into<String>>(&mut self, name: T, value_maybe: Option<T>) -> TokenId {
87        let token = self.t.len();
88        assert!(token < TokenId::MAX as usize);
89        let unique_name = self.fixer_t.get_unique_name(name.into());
90        self.t.push((unique_name, value_maybe.map(|n| n.into())));
91        token as TokenId
92    }
93
94    pub fn extend_terminals<I: IntoIterator<Item=(T, Option<T>)>, T: Into<String>>(&mut self, terminals: I) {
95        for (s, maybe) in terminals {
96            self.add_terminal(s, maybe);
97        }
98    }
99
100    pub fn get_terminals(&self) -> impl Iterator<Item = &(String, Option<String>)> {
101        self.t.iter()
102    }
103
104    pub fn get_num_t(&self) -> usize {
105        self.t.len()
106    }
107
108    pub fn set_t_value(&mut self, token: TokenId, name_maybe: Option<String>) {
109        self.t[token as usize].1 = name_maybe
110    }
111
112    pub fn downsize_num_t(&mut self, num_t: usize) {
113        if num_t < self.t.len() {
114            for (name, _) in &mut self.t[num_t..] {
115                self.fixer_t.remove(name);
116            }
117            self.t.resize(num_t, (String::new(), None));
118        }
119    }
120
121    // -------------------------------------------------------------------------
122
123    fn add_nt(&mut self, unique_name: String) -> VarId {
124        let var = self.nt.len();
125        assert!(var < VarId::MAX as usize);
126        #[cfg(any())]
127        self.names.insert(unique_name.clone(), var as VarId);
128        self.nt.push(unique_name);
129        var as VarId
130    }
131
132    pub fn add_nonterminal<T: Into<String>>(&mut self, name: T) -> VarId {
133        let unique_name = self.fixer_nt.get_unique_name(name.into());
134        self.add_nt(unique_name)
135    }
136
137    pub fn add_child_nonterminal(&mut self, var: VarId) -> VarId {
138        let unique_name = self.fixer_nt.get_unique_name_unum(self.nt[var as usize].clone());
139        self.add_nt(unique_name)
140    }
141    
142    pub fn extend_nonterminals<I: IntoIterator<Item=T>, T: Into<String>>(&mut self, nonterminals: I) {
143        for s in nonterminals {
144            self.add_nonterminal(s);
145        }
146    }
147
148    #[cfg(any())]
149    pub fn find_nonterminal(&self, name: &str) -> Option<VarId> {
150        self.names.get(name).cloned()
151    }
152
153    pub fn get_nonterminals(&self) -> impl Iterator<Item = &String> {
154        self.nt.iter()
155    }
156
157    pub fn get_num_nt(&self) -> usize {
158        self.nt.len()
159    }
160
161    pub fn remove_nonterminal(&mut self, v: VarId) {
162        let name = self.nt.remove(v as usize);
163        #[cfg(any())]
164        {
165            for old_v in self.names.values_mut() {
166                if *old_v >= v { *old_v -= 1; }
167            }
168            self.names.remove(&name);
169        }
170        self.fixer_nt.remove(&name);
171    }
172
173    pub fn set_nt_name(&mut self, var: VarId, name: String) {
174        self.nt[var as usize] = name;
175    }
176
177    pub fn find_nt(&self, name: &str) -> Option<VarId> {
178        self.nt.iter().position(|s| s == name).map(|n| n as VarId)
179    }
180
181    /// Removes the name assigned to NT `var` and returns it. Internally, the name of the NT is
182    /// replaced by another unique string. The NT is expected to be removed later.
183    pub fn remove_nt_name(&mut self, var: VarId) -> String {
184        let mut removed = self.fixer_nt.get_unique_name_num(format!("{var}_removed"));
185        std::mem::swap(&mut self.nt[var as usize], &mut removed);
186        #[cfg(any())]
187        self.names.remove(&removed);
188        self.fixer_nt.remove(&removed);
189        removed
190    }
191
192    pub fn gen_source_code_t(&self, indent: usize, has_enum: bool, has_array: bool) -> String {
193        let mut source = Vec::<String>::new();
194        let (max_n, max_option) = self.get_terminals().fold((0, 0), |(n, option), t| {
195            (n.max(t.0.len()), option.max(t.1.as_ref().map_or(0, |o| o.len())))
196        });
197        if has_enum {
198            // Arrow = 0,  // 0
199            // Colon,      // 1
200            source.push("#[repr(u16)]".to_string());
201            source.push("enum T {".to_string());
202            for (i, (n, _option)) in self.get_terminals().enumerate() {
203                source.push(format!("    {n:max_n$}{} // {i}", if i == 0 { " = 0," } else { ",    " }));
204            }
205            source.push("}".to_string());
206        }
207        if has_enum && has_array {
208            source.push(String::new());
209        }
210        if has_array {
211            // ("Arrow",     Some("->")),          // 0
212            // ("Colon",     Some(":")),           // 1
213            source.push(format!("static TERMINALS: [(&str, Option<&str>); {}] = [", self.get_num_t()));
214            for (i, (n, option)) in self.get_terminals().enumerate() {
215                source.push(format!("    (\"{n}\",{:w1$}{}),{:w2$} // {i}",
216                                    "",
217                                    if let Some(o) = option { format!("Some(\"{o}\")") } else { "None".to_string() },
218                                    "",
219                                    w1 = max_n - n.len(), w2 = max_option + 4 - option.as_ref().map_or(0, |o| o.len() + 4)));
220            }
221            source.push("];".to_string());
222        }
223        indent_source(vec![source], indent)
224    }
225
226    // -------------------------------------------------------------------------
227
228    #[cfg(test)]
229    pub fn dump(&self, title: &str) {
230        if !title.is_empty() {
231            println!("{title}");
232        }
233        println!(
234            "- nonterminals:\n{}",
235            self.get_nonterminals().enumerate().map(|(v, s)| format!("  - NT[{v}]: {s}")).join("\n"));
236        println!(
237            "- terminals:\n{}",
238            self.get_terminals().enumerate()
239                .map(|(t, (n, v_maybe))| format!("  - T[{t}]: {n}{}", if let Some(v) = v_maybe { format!(" = {v:?}") } else { String::new() }))
240                .join("\n"));
241
242    }
243}
244
245impl SymInfoTable for SymbolTable {
246    fn is_token_data(&self, token: TokenId) -> bool {
247        self.t.get(token as usize).map(|t| t.1.is_none()).unwrap_or(false)
248    }
249
250    fn is_symbol_t_data(&self, symbol: &Symbol) -> bool {
251        if let Symbol::T(token) = symbol {
252            self.t.get(*token as usize).map(|t| t.1.is_none()).unwrap_or(false)
253        } else {
254            false
255        }
256    }
257
258    fn is_symbol_t_fixed(&self, symbol: &Symbol) -> bool {
259        if let Symbol::T(token) = symbol {
260            if (*token as usize) < self.t.len() {
261                self.t.get(*token as usize).map(|t| t.1.is_some()).unwrap_or(false)
262            } else {
263                false
264            }
265        } else {
266            false
267        }
268    }
269
270    fn get_t_str(&self, token: TokenId) -> String {
271        match token {
272            _ if (token as usize) < self.t.len() => {
273                let (name, literal) = &self.t[token as usize];
274                literal.as_ref().unwrap_or(name).clone()
275            }
276            TokenId::MAX => "<bad character>".to_string(),
277            _ => format!("T({token}?)")
278        }
279    }
280
281    fn get_t_name(&self, token: TokenId) -> String {
282        if token as usize >= self.t.len() {
283            format!("T({token}?)")
284        } else {
285            self.t[token as usize].0.clone()
286        }
287    }
288
289    fn get_nt_name(&self, var: VarId) -> String {
290        if var as usize >= self.nt.len() { return format!("NT({var}?)") }
291        self.nt[var as usize].clone()
292    }
293
294    fn get_name(&self, symbol: &Symbol) -> String {
295        match symbol {
296            Symbol::Empty | Symbol::End => symbol.to_string(),
297            Symbol::T(token) => self.get_t_name(*token),
298            Symbol::NT(var) => self.get_nt_name(*var),
299        }
300    }
301
302    fn get_str(&self, symbol: &Symbol) -> String {
303        match symbol {
304            Symbol::Empty | Symbol::End => symbol.to_string(),
305            Symbol::T(token) => self.get_t_str(*token),
306            Symbol::NT(var) => self.get_nt_name(*var),
307        }
308    }
309
310    fn get_name_quote(&self, symbol: &Symbol) -> String {
311        match symbol {
312            Symbol::Empty | Symbol::End => symbol.to_string(),
313            Symbol::T(token) => if self.is_symbol_t_fixed(symbol) { format!("{:?}", self.get_t_str(*token)) } else { self.get_t_str(*token) },
314            Symbol::NT(var) => self.get_nt_name(*var),
315        }
316    }
317}
318
319impl Default for SymbolTable {
320    fn default() -> Self {
321        SymbolTable::new()
322    }
323}
324
325#[cfg(test)]
326mod tests {
327    use super::*;
328
329    #[test]
330    fn general() {
331        let mut st = SymbolTable::new();
332        st.extend_nonterminals(["A".to_string(), "NOT_USED".to_string(), "E".to_string()]);
333        assert_eq!(st.add_child_nonterminal(0), 3);
334        assert_eq!(st.get_str(&Symbol::NT(3)), "A_1");
335        assert_eq!(st.add_child_nonterminal(0), 4);
336        assert_eq!(st.get_str(&Symbol::NT(4)), "A_2");
337        assert_eq!(st.add_child_nonterminal(2), 5);
338        assert_eq!(st.get_str(&Symbol::NT(5)), "E_1");
339        assert_eq!(st.add_child_nonterminal(1), 6);
340        st.remove_nonterminal(1);
341        assert_eq!(st.get_str(&Symbol::NT(2)), "A_1");
342        assert_eq!(st.get_str(&Symbol::NT(3)), "A_2");
343        assert_eq!(st.get_str(&Symbol::NT(4)), "E_1");
344        #[cfg(any())]
345        assert_eq!(st.find_nonterminal("A_1"), Some(2));
346        assert!(st.nt.contains(&"A_2".to_string()));
347        #[cfg(any())]
348        assert!(st.names.contains_key("A_2"));
349        assert!(st.fixer_nt.contains("A_2"));
350        st.remove_nt_name(3);
351        assert!(!st.nt.contains(&"A_2".to_string()));
352        #[cfg(any())]
353        assert!(!st.names.contains_key("A_2"));
354        assert!(!st.fixer_nt.contains("A_2"));
355    }
356
357    #[test]
358    fn terminals() {
359        let mut st = SymbolTable::new();
360        let tnames = vec![("a", Some("aa")), ("b", Some("bb")), ("a", Some("A")), ("c", None), ("d", None)];
361        st.extend_terminals(tnames);
362        assert_eq!(st.get_num_t(), 5);
363        let result = st.get_terminals().map(|(s, v)| (s.as_str(), v.as_ref().map(|s| s.as_str()))).to_vec();
364        assert_eq!(result, vec![("a", Some("aa")), ("b", Some("bb")), ("a1", Some("A")), ("c", None), ("d", None)]);
365        for name in vec!["a1", "c", "d"] {
366            assert_eq!(st.fixer_t.contains(name), true);
367        }
368        st.downsize_num_t(2);
369        let result = st.get_terminals().map(|(s, v)| (s.as_str(), v.as_ref().map(|s| s.as_str()))).to_vec();
370        assert_eq!(result, vec![("a", Some("aa")), ("b", Some("bb"))]);
371        assert_eq!(st.get_num_t(), 2);
372        for name in vec!["a1", "c", "d"] {
373            assert_eq!(st.fixer_t.contains(name), false);
374        }
375        let extra_tnames = vec![("a", Some("A")), ("c", None), ("d", None)];
376        st.extend_terminals(extra_tnames);
377        let result = st.get_terminals().map(|(s, v)| (s.as_str(), v.as_ref().map(|s| s.as_str()))).to_vec();
378        assert_eq!(result, vec![("a", Some("aa")), ("b", Some("bb")), ("a1", Some("A")), ("c", None), ("d", None)]);
379    }
380}