rustemo_compiler/
settings.rs

1use clap::clap_derive::ArgEnum;
2use std::fs;
3
4use crate::table::TableType;
5use crate::{Error, Result};
6use std::path::{Path, PathBuf};
7
8use crate::generator::generate_parser;
9
10/// The parsing algorithm used
11#[derive(Debug, Default, Clone, ArgEnum)]
12pub enum ParserAlgo {
13    #[default]
14    LR,
15    GLR,
16}
17
18/// The lexer type used during parsing to break the input into tokens
19#[derive(Debug, Default, Clone, ArgEnum)]
20pub enum LexerType {
21    /// Default lexer if the input is `str` is based on string/regex recognizers
22    #[default]
23    Default,
24    /// The lexer will be supplied by the user
25    Custom,
26}
27
28/// The builder type used during parsing to construct the output
29#[derive(Debug, Default, Clone, ArgEnum)]
30pub enum BuilderType {
31    /// Default builder type constructs AST using inferred node types
32    #[default]
33    Default,
34    /// Generic builder generates CST where each node is `TreeNode`
35    Generic,
36    /// The builder is user provided
37    Custom,
38}
39
40/// Different generated parser table variants with different trade-offs
41#[derive(Debug, Default, Clone, ArgEnum)]
42pub enum GeneratorTableType {
43    /// Table is generated as nested static arrays
44    /// Access time should be relatively good but it produces
45    /// larger executables.
46    Arrays,
47    /// Table is an array of function pointers where functions contain match
48    /// expression for further disambiguation. Uses less statically allocated
49    /// storage but requires function call and pattern matching.
50    #[default]
51    Functions,
52}
53
54/// Provides parser settings information.
55///
56/// It is the main entry point in the parser generation process. It is meant to
57/// be used from the project `build.rs` script. See [tests crate `build.rs`
58/// script](https://github.com/igordejanovic/rustemo/blob/main/tests/build.rs)
59/// for examples of various configurations.
60///
61/// The first step is to create default `Settings` instance, do necessary
62/// configuration by calling methods in a builder (chain) style and, at the end,
63/// call the method to process the grammar, either by directly specifying the
64/// file or recursivelly processing the directory.
65///
66/// Most of these settings are also exposed through `rcomp` CLI tool so you can
67/// process grammar and generate parsers from the command line (or shell script)
68/// if you prefer.
69///
70/// You can read more in the [Rustemo book](https://www.igordejanovic.net/rustemo/)
71///
72/// ## Example
73///
74/// ```rust
75/// rustemo_compiler::Settings::new().parser_algo(ParserAlgo::GLR).process_crate_dir()
76/// ```
77#[derive(Debug, Clone)]
78pub struct Settings {
79    pub(crate) out_dir_root: Option<PathBuf>,
80    pub(crate) out_dir_actions_root: Option<PathBuf>,
81    pub(crate) root_dir: Option<PathBuf>,
82
83    pub(crate) prefer_shifts: bool,
84    pub(crate) prefer_shifts_over_empty: bool,
85    pub(crate) table_type: TableType,
86    pub(crate) parser_algo: ParserAlgo,
87    pub(crate) print_table: bool,
88    pub(crate) exclude: Vec<String>,
89    pub(crate) actions: bool,
90    pub(crate) notrace: bool,
91
92    pub(crate) lexer_type: LexerType,
93    pub(crate) builder_type: BuilderType,
94    pub(crate) builder_loc_info: bool,
95    pub(crate) generator_table_type: GeneratorTableType,
96    pub(crate) input_type: String,
97
98    pub(crate) lexical_disamb_most_specific: bool,
99    pub(crate) lexical_disamb_longest_match: bool,
100    pub(crate) lexical_disamb_grammar_order: bool,
101
102    pub(crate) partial_parse: bool,
103    pub(crate) skip_ws: bool,
104
105    pub(crate) force: bool,
106    force_explicit: bool,
107
108    pub(crate) dot: bool,
109    pub(crate) fancy_regex: bool,
110}
111
112impl Default for Settings {
113    fn default() -> Self {
114        // If called from cargo build use OUT_DIR as a default out_dir directory
115        // for both parser and actions.
116        let out_dir_root = std::env::var("OUT_DIR").map_or(None, |d| Some(PathBuf::from(d)));
117
118        // By default root dir is the root of the cargo project.
119        let root_dir =
120            std::env::var("CARGO_MANIFEST_DIR").map_or(None, |d| Some(PathBuf::from(d)));
121
122        Self {
123            root_dir,
124            out_dir_root: out_dir_root.clone(),
125            out_dir_actions_root: out_dir_root,
126            prefer_shifts: false,
127            prefer_shifts_over_empty: true,
128            table_type: Default::default(),
129            parser_algo: Default::default(),
130            print_table: false,
131            actions: true,
132            notrace: false,
133            lexer_type: Default::default(),
134            builder_type: Default::default(),
135            builder_loc_info: false,
136            generator_table_type: Default::default(),
137            input_type: "str".into(),
138            lexical_disamb_most_specific: true,
139            lexical_disamb_longest_match: true,
140            lexical_disamb_grammar_order: true,
141            partial_parse: false,
142            skip_ws: true,
143            force: true, // Overwriting actions by default
144            force_explicit: false,
145            exclude: vec![],
146            dot: false,
147            fancy_regex: false,
148        }
149    }
150}
151
152impl Settings {
153    /// Creates a default instance.
154    pub fn new() -> Self {
155        Settings::default()
156    }
157
158    /// Root dir used to calculate output file path from the input grammar path
159    /// when the `out_dir_root` is not `None`.
160    /// It can be overridden explicitly or when using `process_dir` call.
161    /// It is an error if `root_dir` is `None`, `our_dir_root` is set and
162    /// `CARGO_MANIFEST_DIR` env variable cannot be found.
163    pub fn root_dir(mut self, root_dir: PathBuf) -> Self {
164        self.root_dir = Some(root_dir);
165        self
166    }
167
168    /// Sets output root for the generated parser. By default, the parser is
169    /// generated in the source tree next to the grammar.
170    pub fn out_dir_root(mut self, out_dir: PathBuf) -> Self {
171        self.out_dir_root = Some(out_dir);
172        self
173    }
174
175    /// Output root for the generated actions when default builder is used. By
176    /// default, actions are generated in the source tree next to the grammar.
177    pub fn out_dir_actions_root(mut self, out_dir: PathBuf) -> Self {
178        self.out_dir_actions_root = Some(out_dir);
179        self
180    }
181
182    /// Generate both parser and actions (for default builder) in the source
183    /// tree, next to the grammar. By default, parser and actions are generated
184    /// in out `OUT_DIR`.
185    pub fn in_source_tree(mut self) -> Self {
186        self.out_dir_root = None;
187        if matches!(self.builder_type, BuilderType::Default) {
188            self.actions_in_source_tree()
189        } else {
190            self
191        }
192    }
193
194    /// Generate actions in the source tree (if the default builder is used),
195    /// next to the grammar. By default, actions are generated in out `OUT_DIR`.
196    pub fn actions_in_source_tree(mut self) -> Self {
197        if !matches!(self.builder_type, BuilderType::Default) {
198            panic!("Settings 'actions_in_source_tree' is only available for the default builder type!");
199        }
200        self.out_dir_actions_root = None;
201        if !self.force_explicit {
202            self.force = false;
203        }
204        self
205    }
206
207    /// Excludes path from processing. If path contains any of the string given
208    /// in `exclude` vector it will be skipped.
209    pub fn exclude(mut self, exclude: Vec<String>) -> Self {
210        self.exclude = exclude;
211        self
212    }
213
214    /// When there are competing REDUCE and SHIFT operations, this settings will
215    /// always favor SHIFT.
216    pub fn prefer_shifts(mut self, prefer: bool) -> Self {
217        self.prefer_shifts = prefer;
218        self
219    }
220
221    /// When there are competing EMPTY reduction and SHIFT, this settings will
222    /// always favor SHIFT.
223    pub fn prefer_shifts_over_empty(mut self, prefer: bool) -> Self {
224        self.prefer_shifts_over_empty = prefer;
225        self
226    }
227
228    /// LR table type to construct.
229    pub fn table_type(mut self, table_type: TableType) -> Self {
230        self.table_type = table_type;
231        self
232    }
233
234    /// LR algorithm to use
235    pub fn parser_algo(mut self, parser_algo: ParserAlgo) -> Self {
236        match parser_algo {
237            ParserAlgo::LR => {}
238            ParserAlgo::GLR => {
239                // For GLR we are using RN tables
240                self.table_type = TableType::LALR_RN;
241                // For GLR we should not favour shifts at all
242                self.prefer_shifts = false;
243                self.prefer_shifts_over_empty = false;
244                // We don't use grammar order by default
245                self.lexical_disamb_grammar_order = false;
246            }
247        }
248        self.parser_algo = parser_algo;
249        self
250    }
251
252    /// Sets lexer type. Default lexer is used for string inputs and is based on
253    /// regex/string matches from the grammar.
254    pub fn lexer_type(mut self, lexer_type: LexerType) -> Self {
255        self.lexer_type = lexer_type;
256        self
257    }
258
259    /// Sets builder type. The default builder will deduce AST types and actions.
260    pub fn builder_type(mut self, builder_type: BuilderType) -> Self {
261        self.builder_type = builder_type;
262        self
263    }
264
265    /// Should generated default AST builder types contain location/layout information
266    /// This is only used if builder-type is default.
267    pub fn builder_loc_info(mut self, builder_loc_info: bool) -> Self {
268        self.builder_loc_info = builder_loc_info;
269        self
270    }
271
272    /// Sets generator table type. The default is nested static arrays.
273    pub fn generator_table_type(mut self, generator_table_type: GeneratorTableType) -> Self {
274        self.generator_table_type = generator_table_type;
275        self
276    }
277
278    /// Sets the input type. Default is `str`
279    pub fn input_type(mut self, input_type: String) -> Self {
280        self.input_type = input_type;
281        self
282    }
283
284    /// Lexical disambiguation using most specific match strategy.
285    pub fn lexical_disamb_most_specific(mut self, most_specific: bool) -> Self {
286        self.lexical_disamb_most_specific = most_specific;
287        self
288    }
289
290    /// Lexical disambiguation using longest match strategy.
291    pub fn lexical_disamb_longest_match(mut self, longest_match: bool) -> Self {
292        self.lexical_disamb_longest_match = longest_match;
293        self
294    }
295
296    /// Lexical disambiguation using grammar order.
297    pub fn lexical_disamb_grammar_order(mut self, grammar_order: bool) -> Self {
298        if let ParserAlgo::LR = self.parser_algo {
299            if !grammar_order {
300                panic!("Can't disable grammar order strategy for LR.")
301            }
302        }
303        self.lexical_disamb_grammar_order = grammar_order;
304        self
305    }
306
307    /// Set whether or not we use [`fancy_regex`](https://docs.rs/fancy-regex/latest/fancy_regex/)
308    /// instead of [`regex`](https://docs.rs/regex/latest/regex/)
309    pub fn fancy_regex(mut self, fancy_regex: bool) -> Self {
310        self.fancy_regex = fancy_regex;
311        self
312    }
313
314    pub fn print_table(mut self, print_table: bool) -> Self {
315        self.print_table = print_table;
316        self
317    }
318
319    /// If partial parse is allowed parsing can succeed even if the parser
320    /// didn't reach the end of the input. Use with care, especially with GLR
321    /// parsing as it may lead to a large number of partial solutions.
322    pub fn partial_parse(mut self, partial_parse: bool) -> Self {
323        self.partial_parse = partial_parse;
324        self
325    }
326
327    /// Should whitespaces be skipped. `true` by default. Not used if Layout
328    /// rule exists in the Grammar. Used only in the default lexer.
329    pub fn skip_ws(mut self, skip_ws: bool) -> Self {
330        self.skip_ws = skip_ws;
331        self
332    }
333
334    /// Should actions be generated. `true` by default. Used only if default
335    /// builder is used.
336    pub fn actions(mut self, actions: bool) -> Self {
337        self.actions = actions;
338        self
339    }
340
341    /// Should trace log be printed. `false` by default. Does nothing for
342    /// release builds as trace is only available in debug build. Can also be
343    /// set by `RUSTEMO_NOTRACE=1` env variable.
344    pub fn notrace(mut self, notrace: bool) -> Self {
345        let notrace = if !notrace {
346            std::env::var("RUSTEMO_NOTRACE").is_ok()
347        } else {
348            std::env::set_var("RUSTEMO_NOTRACE", "1");
349            true
350        };
351
352        self.notrace = notrace;
353        self
354    }
355
356    /// Should actions file be recreated if exist. Use with care.
357    pub fn force(mut self, force: bool) -> Self {
358        self.force = force;
359        self.force_explicit = true;
360        self
361    }
362
363    /// If this is set a .dot file with automata visualization will be produced during
364    /// compiling.
365    pub fn dot(mut self, dot: bool) -> Self {
366        self.dot = dot;
367        self
368    }
369
370    /// Recursively traverse the root dir and process each Rustemo grammar found.
371    /// Used as the last call to the configured [Settings] value.
372    pub fn process_dir(&self) -> Result<()> {
373        if let Some(root_dir) = &self.root_dir {
374            if !root_dir.exists() {
375                return Err(Error::Error(format!(
376                    "Directory/File {root_dir:?} doesn't exist."
377                )));
378            }
379
380            let visitor = |grammar: &Path| -> Result<()> {
381                self.process_grammar(grammar)?;
382                Ok(())
383            };
384
385            self.visit_dirs(root_dir, &visitor)
386        } else {
387            Err(Error::Error("Root dir must be set!".to_string()))
388        }
389    }
390
391    /// Process the given grammar and generates the parser and actions (if
392    /// default builder is used). Used as the last call to the configured
393    /// [Settings] value.
394    pub fn process_grammar(&self, grammar: &Path) -> Result<()> {
395        println!("Generating parser for grammar {:?}", grammar);
396        let relative_outdir = |p: &Path| -> Result<PathBuf> {
397            Ok(p.join(
398                grammar
399                    .parent()
400                    .ok_or(Error::Error(
401                        "Cannot find parent of '{grammar:?}' file.".to_string(),
402                    ))?
403                    .strip_prefix(self.root_dir.as_ref().expect("'root_dir' must be set!"))
404                    .or(Err(Error::Error(
405                        "Cannot remove prefix '{root_dir:?}' from '{grammar:?}'.".to_string(),
406                    )))?,
407            ))
408        };
409
410        let out_dir = self
411            .out_dir_root
412            .as_ref()
413            .map(|p| relative_outdir(p))
414            .transpose()?;
415
416        let out_dir_actions = self
417            .out_dir_actions_root
418            .as_ref()
419            .map(|p| relative_outdir(p))
420            .transpose()?;
421
422        if let Some(ref dir) = out_dir {
423            println!("Parser out dir: {dir:?}");
424        }
425        if let Some(ref dir) = out_dir_actions {
426            println!("Actions out dir: {dir:?}");
427        }
428
429        generate_parser(
430            grammar,
431            out_dir.as_deref(),
432            out_dir_actions.as_deref(),
433            self,
434        )
435    }
436
437    /// Recursively visits dirs starting from the given `dir` and calls
438    /// `visitor` for each Rustemo grammar found.
439    fn visit_dirs(&self, dir: &Path, visitor: &dyn Fn(&Path) -> Result<()>) -> Result<()> {
440        if dir.is_dir() {
441            for entry in fs::read_dir(dir)? {
442                let entry = entry?;
443                let path = entry.path();
444
445                // Check excluded paths
446                let path_name = path.to_string_lossy();
447                if self.exclude.iter().any(|e| path_name.contains(e)) {
448                    println!("Excluding path: {path_name:?}");
449                    continue;
450                }
451
452                if path.is_dir() {
453                    self.visit_dirs(&path, visitor)?;
454                } else if matches!(path.extension(), Some(ext) if ext == "rustemo") {
455                    visitor(&path)?
456                }
457            }
458        }
459        Ok(())
460    }
461}
462
463/// Recursively process a given dir and generate a parser for each found
464/// grammar with default settings.
465///
466/// # Example
467///
468/// ```rust
469/// rustemo_compiler::process_dir("~/my_project")
470/// ```
471///
472/// # Errors
473///
474/// In case of an error a value of [rustemo::Error] is returned.
475pub fn process_dir<P: AsRef<Path>>(dir: P) -> Result<()> {
476    Settings::new()
477        .root_dir(PathBuf::from(dir.as_ref()))
478        .process_dir()?;
479    Ok(())
480}
481
482/// A shortcut function which creates default [Settings] and use it to process
483/// the crate project directory.
484pub fn process_crate_dir() -> Result<()> {
485    Settings::new().process_dir()?;
486    Ok(())
487}
488
489/// Generates a parser from the given grammar file with default settings.
490///
491/// # Errors
492///
493/// In case of an error a value of [rustemo::Error] is returned.
494pub fn process_grammar<P: AsRef<Path>>(grammar: P) -> Result<()> {
495    Settings::new().process_grammar(grammar.as_ref())?;
496    Ok(())
497}