syntax_parser_generator/parsing/translator/
build.rs

1use std::collections::HashMap;
2
3use crate::handles::{Handle, Handled};
4use crate::handles::collections::{HandledVec, HandleMap};
5use crate::handles::specials::AutomaticallyHandled;
6use crate::parsing::lr_parser::build::LrParserBuilder;
7use crate::parsing::lr_parser::rules::{Associativity, Binding, GrammarSymbol};
8use crate::parsing::translator::handlers::{LeafSatelliteBuilder, SatelliteReducer};
9use crate::parsing::translator::sdt::SyntaxDirectedTranslator;
10
11/// An interface for specifying and compiling a [SyntaxDirectedTranslator].
12///
13/// An instance of this type serve as a "contractor" for building a LALR parser. It exports an API
14/// to communicate the specifications of the parser, and when you're done - its
15/// [build](SyntaxDirectedTranslatorBuilder::build) can be used to compile a matching instance of
16/// [SyntaxDirectedTranslator].
17///
18/// # Features
19///
20/// ## Bindings
21///
22/// A group of terminal symbols (i.e. lexeme types) can be associated with a _binding_, which can
23/// then be attached to the grammar's production rules. Bindings are used to resolve shift-reduce
24/// conflicts: when deciding between shifting by a bound terminal or reducing by a bound rule,
25/// the one with higher priority binding is selected. If both have the same binding - we reduce if
26/// it is left-associated, and shift if it is right-associated.
27///
28/// This can be used to implement associativity and precedence of operators. For example - one
29/// left-associated higher-priority binding for multiplication and division, and another
30/// left-associated lower-priority binding for addition and subtraction.
31///
32/// ## Dubs
33///
34/// To make the parser's programmatic specification cleaner, the notion of _dubs_ was introduced:
35/// instead of referring to a terminal, nonterminal, or a binding, by some complex Rust object,
36/// these are referred to by their "dubs" - character strings that identify them.
37///
38/// ## Translation Scheme Specifications
39///
40/// The compiled parser will be capable of translating a given sequence of lexemes into some
41/// higher-level representation (AST, IR sequence, etc.). We refer to this higher level
42/// representation as _satellite data_ (as it escorts subtrees of the input's syntax tree). Client
43/// code is free to determine this target representation and the translation logic.
44///
45/// This translation logic is carried out recursively: a subtree of the input syntax tree is
46/// translated into satellite data as a function of the translations of its own subtrees. We call
47/// such translation functions _satellite reducers_, as they execute after each reduction of
48/// grammar symbols by the underlying LR parser. At the leaves, lexemes are directly translated to
49/// satellite data using dedicated functions we call _leaf satellite builders_.
50///
51/// All of these translation functions receive a mutable reference to the _translation context_ as
52/// an additional argument, which is a user-defined object that's used to hold global knowledge
53/// about the translated input, such as symbol tables and general statistics.
54///
55/// All of this translation logic is of course user-defined, providing client code with great
56/// flexibility when building custom syntax-directed translators.
57///
58/// # Example
59///
60/// Check out the example at [parsing](crate::parsing).
61///
62/// # Conflict Resolution Policy
63///
64/// * Reduce-reduce conflicts cannot be resolved.
65/// * Shift-reduce conflicts are resolved with "shift" by default, unless otherwise specified by
66///     bindings.
67pub struct SyntaxDirectedTranslatorBuilder<LexemeType, Context, Satellite>
68where
69    LexemeType: AutomaticallyHandled,
70{
71    grammar_symbol_dub_map: HashMap<String, GrammarSymbol<LexemeType, Nonterminal>>,
72    nonterminals: HandledVec<Nonterminal>,
73    lr_parser_builder:
74        LrParserBuilder<LexemeType, Nonterminal, SatelliteReducer<Context, Satellite>>,
75    bindings_dub_map: HashMap<String, Handle<Binding<LexemeType>>>,
76    leaf_satellite_builder_map: HandleMap<LexemeType, LeafSatelliteBuilder<Context, Satellite>>,
77    default_leaf_satellite_builder: Option<LeafSatelliteBuilder<Context, Satellite>>,
78    satellite_reducers: HandledVec<SatelliteReducer<Context, Satellite>>,
79}
80
81impl<LexemeType, Context, Satellite> SyntaxDirectedTranslatorBuilder<LexemeType, Context, Satellite>
82where
83    LexemeType: AutomaticallyHandled,
84{
85    /// Create a blank [SyntaxDirectedTranslatorBuilder], with no registered specifications.
86    pub fn new() -> Self {
87        Self {
88            grammar_symbol_dub_map: HashMap::new(),
89            nonterminals: HandledVec::new(),
90            lr_parser_builder: LrParserBuilder::new(),
91            bindings_dub_map: HashMap::new(),
92            leaf_satellite_builder_map: HandleMap::new(),
93            default_leaf_satellite_builder: None,
94            satellite_reducers: HandledVec::new(),
95        }
96    }
97
98    /// Dub with a category of lexemes.
99    ///
100    /// Remember: lexeme types serve as terminal symbols for the underlying LALR parser. This
101    /// function, in effect, dubs a terminal symbol in the grammar.
102    ///
103    /// # Panics
104    ///
105    /// If some other grammar symbol is already identified by this dub.
106    pub fn dub_lexeme_type(&mut self, lexeme_type: LexemeType, dub: &str) {
107        if self.grammar_symbol_dub_map.contains_key(dub) {
108            panic!(
109                "Tried to dub a lexeme type {:?}, which is already used to dub another \
110                grammar symbol",
111                dub,
112            )
113        }
114        self.grammar_symbol_dub_map.insert(
115            String::from(dub),
116            GrammarSymbol::Terminal(lexeme_type.handle()),
117        );
118    }
119
120    /// Dub multiple categories of lexemes at once.
121    ///
122    /// # Panics
123    ///
124    /// If a new dub is already used to dub another grammar symbol.
125    pub fn dub_lexeme_types<'a>(
126        &mut self,
127        lexeme_type_dubs: impl Iterator<Item=(LexemeType, &'a str)>,
128    ) {
129        for (lexeme_type, dub) in lexeme_type_dubs {
130            self.dub_lexeme_type(lexeme_type, dub);
131        }
132    }
133
134    /// Create a new nonterminal grammar symbol, associated with the given dub.
135    ///
136    /// # Panics
137    ///
138    /// If some other grammar symbol is already identified by this dub.
139    pub fn new_nonterminal(&mut self, dub: &str) {
140        if self.grammar_symbol_dub_map.contains_key(dub) {
141            panic!(
142                "Tried to create a new nonterminal dubbed {:?}, which is already used to dub \
143                another grammar symbol",
144                dub,
145            )
146        }
147        let nonterminal = self.nonterminals.insert(Nonterminal);
148        self.grammar_symbol_dub_map
149            .insert(String::from(dub), GrammarSymbol::Nonterminal(nonterminal));
150    }
151
152    /// Create multiple nonterminals at once, associated with the given dubs.
153    ///
154    /// # Panics
155    ///
156    /// If a new dub is already used to dub another grammar symbol.
157    pub fn new_nonterminals<'a>(&mut self, dubs: impl Iterator<Item=&'a str>) {
158        for dub in dubs {
159            self.new_nonterminal(dub);
160        }
161    }
162
163    /// Set the start nonterminal of the underlying grammar.
164    ///
165    /// The nonterminal is identified by its `dub`.
166    ///
167    /// # Panics
168    ///
169    /// If no nonterminal is associated with this dub.
170    pub fn set_start_nonterminal(&mut self, dub: &str) {
171        let nonterminal = match self.grammar_symbol_dub_map.get(dub) {
172            Some(GrammarSymbol::Nonterminal(nonterminal)) => *nonterminal,
173            _ => panic!(
174                "Tried to set start nonterminal to a non-existing nonterminal dubbed {:?}",
175                dub,
176            ),
177        };
178        self.lr_parser_builder.set_start_nonterminal(nonterminal);
179    }
180
181    /// Register a new binding.
182    ///
183    /// Note that the order of registration determines the priority of the bindings: bindings
184    /// registered earlier have higher priority over bindings registered later.
185    ///
186    /// # Arguments
187    /// * `bound_lexeme_types_dubs` - the dubs of the terminal grammar symbols (lexeme categories)
188    ///     that are bound to this binding.
189    /// * `associativity` - the binding's associativity.
190    /// * `binding_dub` - a dub, used to identify this binding.
191    ///
192    /// # Panics
193    ///
194    /// If the binding's dub is already used to dub another binding, or if no terminal symbol is
195    /// dubbed with one of the `bound_lexeme_types_dubs`.
196    pub fn new_binding(
197        &mut self,
198        bound_lexeme_types_dubs: Vec<&str>,
199        associativity: Associativity,
200        binding_dub: &str,
201    ) {
202        if self.bindings_dub_map.contains_key(binding_dub) {
203            panic!(
204                "Tried to create a new binding dubbed {:?}, which is already used to dub an \
205                existing binding",
206                binding_dub,
207            )
208        }
209        let terminals = bound_lexeme_types_dubs
210            .iter()
211            .map(
212                |&lexeme_type_dub| match self.grammar_symbol_dub_map.get(lexeme_type_dub) {
213                    Some(GrammarSymbol::Terminal(terminal)) => *terminal,
214                    _ => panic!(
215                        "Tried to create a binding dubbed {:?} , bound to a non-existing \
216                        lexeme type dubbed {:?}",
217                        binding_dub, lexeme_type_dub,
218                    ),
219                },
220            )
221            .collect();
222        self.bindings_dub_map.insert(
223            String::from(binding_dub),
224            self.lr_parser_builder
225                .register_binding(terminals, associativity),
226        );
227    }
228
229    /// Set the function that'll transform lexemes of a certain type into satellite data.
230    ///
231    /// When a lexeme of the category dubbed `lexeme_type_dub` will be encountered, `builder` will
232    /// be called with the lexeme's contents (alongside a reference to the translation context), and
233    /// the output will serve as the satellite data attached to this "terminal symbol".
234    ///
235    /// # Panics
236    ///
237    /// If `lexeme_type_dub` isn't a known lexeme-type dub.
238    pub fn set_leaf_satellite_builder<F>(&mut self, lexeme_type_dub: &str, builder: F)
239    where
240        F: Fn(&mut Context, String) -> Satellite + 'static,
241    {
242        let lexeme_type = match self.grammar_symbol_dub_map.get(lexeme_type_dub) {
243            Some(GrammarSymbol::Terminal(lexeme_type)) => *lexeme_type,
244            _ => panic!(
245                "Tried to set a leaf satellite builder for a non-existing lexeme type dubbed \
246                    {:?}",
247                lexeme_type_dub,
248            ),
249        };
250        self.leaf_satellite_builder_map
251            .insert(lexeme_type, Box::new(builder));
252    }
253
254    /// Set a leaf-satellite builder to process lexeme types for which no builder was set with
255    /// [SyntaxDirectedTranslatorBuilder::set_leaf_satellite_builder].
256    pub fn set_default_leaf_satellite_builder<F>(&mut self, builder: F)
257    where
258        F: Fn(&mut Context, String) -> Satellite + 'static,
259    {
260        self.default_leaf_satellite_builder = Some(Box::new(builder));
261    }
262
263    /// Add a production rule to the underlying context-free grammar.
264    ///
265    /// # Arguments
266    ///
267    /// * `lhs` - the dub of the nonterminal in the left-hand side of the production.
268    /// * `rhs` - a sequence of grammar-symbols dubs (terminals and nonterminals), that appear in
269    ///     the right-hand side of the production.
270    /// * `satellite_reducer` - when reducing by the new rule, this function will be called with the
271    ///     satellite data of the right-hand side grammar symbols (and the translation context), and
272    ///     its output will serve as the satellite data attached to the left-hand side nonterminal.
273    ///
274    /// # Panics
275    ///
276    /// If any the given dubs isn't known as a matching dub.
277    pub fn register_rule<F>(&mut self, lhs: &str, rhs: Vec<&str>, satellite_reducer: F)
278    where
279        F: Fn(&mut Context, Vec<Satellite>) -> Satellite + 'static,
280    {
281        self.register_rule_raw(lhs, rhs, None, satellite_reducer);
282    }
283
284    /// Register a production rule with an associated binding.
285    ///
286    /// The associated binding is identified by its dub - `binding_dub`. The rest of the arguments
287    /// are identical to [SyntaxDirectedTranslatorBuilder::register_rule].
288    ///
289    /// # Panics
290    ///
291    /// If any the given dubs isn't known as a matching dub.
292    pub fn register_bound_rule<F>(
293        &mut self,
294        lhs: &str,
295        rhs: Vec<&str>,
296        binding_dub: &str,
297        satellite_reducer: F,
298    ) where
299        F: Fn(&mut Context, Vec<Satellite>) -> Satellite + 'static,
300    {
301        self.register_rule_raw(lhs, rhs, Some(binding_dub), satellite_reducer);
302    }
303
304    /// Register a production rule for which the satellite reducer copies the satellite data from
305    /// its 1-length RHS to its LHS.
306    ///
307    /// This is commonly used for productions, such as `expression -> integer_literal`.
308    ///
309    /// # Arguments
310    ///
311    /// * `lhs` - the dub of the nonterminal in the left-hand side of the production.
312    /// * `rhs` - the dub of the only grammar-symbol in the right-hand side of the production.
313    ///
314    /// # Panics
315    ///
316    /// If any the given dubs isn't known as a matching dub.
317    pub fn register_identity_rule(&mut self, lhs: &str, rhs: &str) {
318        self.register_rule(lhs, vec![rhs], |_context, mut satellites| {
319            satellites.pop().expect(
320                "Tried to reduce satellites using `identity_satellite_reducer`, but the \
321                provided RHS satellite list is empty (it should contain a single satellite for the \
322                single item in the RHS)",
323            )
324        });
325    }
326
327    /// Register a production rule whose RHS is empty.
328    ///
329    /// This is commonly used with nonterminals that represent lists that can be empty. For
330    /// example `statements_list -> _nothing_`.
331    ///
332    /// # Arguments
333    ///
334    /// * `lhs` - the dub of the nonterminal in the left-hand side of the production.
335    /// * `default_satellite_builder` - when reducing by this rule, this will be called (with the
336    ///     translation context), and its output will serve as the satellite data attached to the
337    ///     left-hand side nonterminal.
338    ///
339    /// # Panics
340    ///
341    /// If `lhs` is not a known dub for a nonterminal.
342    pub fn register_empty_rule<F>(&mut self, lhs: &str, default_satellite_builder: F)
343    where
344        F: Fn(&mut Context) -> Satellite + 'static,
345    {
346        self.register_rule(lhs, vec![], move |context, _satellites| {
347            default_satellite_builder(context)
348        })
349    }
350
351    fn register_rule_raw<F>(
352        &mut self,
353        lhs_dub: &str,
354        rhs_dubs: Vec<&str>,
355        binding_dub: Option<&str>,
356        satellite_reducer: F,
357    ) where
358        F: Fn(&mut Context, Vec<Satellite>) -> Satellite + 'static,
359    {
360        let lhs = match self.grammar_symbol_dub_map.get(lhs_dub) {
361            Some(GrammarSymbol::Nonterminal(nonterminal)) => *nonterminal,
362            _ => panic!(
363                "Tried to register a production rule whose LHS is a non-existing nonterminal \
364                dubbed {:?}",
365                lhs_dub,
366            ),
367        };
368        let rhs = rhs_dubs
369            .iter()
370            .map(|&dub| match self.grammar_symbol_dub_map.get(dub) {
371                Some(&grammar_symbol) => grammar_symbol,
372                _ => panic!(
373                    "Tried to register a production rule whose RHS contains non-existing grammar \
374                symbol dubbed {:?}",
375                    dub,
376                ),
377            })
378            .collect();
379        let binding = binding_dub.map(|actual_binding_dub| {
380            match self.bindings_dub_map.get(actual_binding_dub) {
381                Some(&binding) => binding,
382                None => panic!(
383                    "Tried to register a production rule bound to a non-existing binding dubbed \
384                    {:?}",
385                    actual_binding_dub,
386                ),
387            }
388        });
389        let tag = self.satellite_reducers.insert(Box::new(satellite_reducer));
390        self.lr_parser_builder.register_rule(lhs, rhs, binding, tag);
391    }
392
393    /// Compile the set specifications into a functioning [SyntaxDirectedTranslator].
394    pub fn build(self) -> SyntaxDirectedTranslator<LexemeType, Context, Satellite> {
395        SyntaxDirectedTranslator {
396            lr_parser: self.lr_parser_builder.build(),
397            default_leaf_satellite_builder: self.default_leaf_satellite_builder,
398            leaf_satellite_builder_map: self.leaf_satellite_builder_map,
399            satellite_reducers: self.satellite_reducers,
400        }
401    }
402}
403
404// Blank, don't really need to carry any info, Handle API is only used for counting registrations
405pub struct Nonterminal;
406impl Handled for Nonterminal {
407    type HandleCoreType = u8;
408}