Skip to main content

parol/generators/
scanner_config.rs

1use crate::{GrammarConfig, parser::parol_grammar::ScannerStateSwitch};
2use anyhow::{Result, bail};
3use parol_runtime::{
4    TerminalIndex,
5    lexer::{
6        BLOCK_COMMENT, ERROR_TOKEN, FIRST_USER_TOKEN, LINE_COMMENT, NEW_LINE, NEW_LINE_TOKEN,
7        WHITESPACE, WHITESPACE_TOKEN,
8    },
9};
10use std::fmt::{Debug, Display, Error, Formatter};
11
// A terminal mapping consists of: regular expression + terminal index + optional lookahead
// (flag telling whether the lookahead is positive, lookahead pattern) + generated token name
type TerminalMapping = (String, TerminalIndex, Option<(bool, String)>, String);
// A scanner transition is a tuple of the terminal index that triggers it and the scanner state
// switch to perform
type ScannerTransition = (TerminalIndex, ScannerStateSwitch);
// The build information is a tuple of terminal mappings and scanner transitions
type BuildInformation = (Vec<TerminalMapping>, Vec<ScannerTransition>);
18
19// ---------------------------------------------------
20// Part of the Public API
21// *Changes will affect crate's version according to semver*
22// ---------------------------------------------------
///
/// Configuration information for a scanner.
/// Besides the comment definitions it contains switches to optionally turn off the automatic
/// handling of newlines and whitespace.
///
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct ScannerConfig {
    ///
    /// The name of the scanner state taken from the grammar description
    ///
    pub scanner_name: String,

    ///
    /// Index of the scanner, aka scanner state
    ///
    pub scanner_state: usize,

    ///
    /// Strings with the characters that start line comments
    ///
    pub line_comments: Vec<String>,

    ///
    /// (String, String) tuples with the characters that start and end
    /// a block comment, respectively.
    ///
    pub block_comments: Vec<(String, String)>,

    ///
    /// If true the lexer handles (and skips) newlines.
    /// If false the user has to handle newlines on their own.
    ///
    pub auto_newline: bool,

    ///
    /// If true the lexer handles (and skips) whitespace.
    /// If false the user has to handle whitespace on their own.
    ///
    pub auto_ws: bool,

    /// If true, unmatched input is allowed without error.
    /// In this case no error token is added to the terminals of this scanner.
    pub allow_unmatched: bool,

    /// Scanner state transitions
    /// Maps from token to scanner state, where the token is identified by its TerminalIndex
    /// The scanner state is identified by its index.
    pub transitions: Vec<(TerminalIndex, ScannerStateSwitch)>,
}
70
71impl ScannerConfig {
72    /// Creates a new item
73    pub fn new(scanner_name: String, scanner_state: usize) -> Self {
74        Self {
75            scanner_name,
76            scanner_state,
77            line_comments: Vec::new(),
78            block_comments: Vec::new(),
79            auto_newline: true,
80            auto_ws: true,
81            allow_unmatched: false,
82            transitions: Vec::new(),
83        }
84    }
85
86    /// Adds line comments to self
87    pub fn with_line_comments(mut self, line_comments: Vec<String>) -> Self {
88        self.line_comments = line_comments;
89        self
90    }
91
92    /// Adds block comments to self
93    pub fn with_block_comments(mut self, block_comments: Vec<(String, String)>) -> Self {
94        self.block_comments = block_comments;
95        self
96    }
97
98    /// Sets auto newline behavior
99    pub fn with_auto_newline(mut self, auto_newline: bool) -> Self {
100        self.auto_newline = auto_newline;
101        self
102    }
103
104    /// Sets auto whitespace behavior
105    pub fn with_auto_ws(mut self, auto_ws: bool) -> Self {
106        self.auto_ws = auto_ws;
107        self
108    }
109
110    /// Sets allow unmatched behavior
111    pub fn with_allow_unmatched(mut self, allow_unmatched: bool) -> Self {
112        self.allow_unmatched = allow_unmatched;
113        self
114    }
115
116    /// Generates the data needed by the lexer generator.
117    /// The tuple contains the mapping of terminal strings to their indices plus an optional
118    /// lookahead pattern and the transitions, i.e. a mapping of terminal indices to scanner names.
119    ///
120    pub fn generate_build_information(
121        &self,
122        grammar_config: &GrammarConfig,
123        terminal_names: &[String],
124    ) -> Result<BuildInformation> {
125        let cfg = &grammar_config.cfg;
126        let mut terminal_mappings = Vec::new();
127        if self.auto_newline {
128            terminal_mappings.push((
129                NEW_LINE_TOKEN.to_owned(),
130                NEW_LINE,
131                None,
132                terminal_names[NEW_LINE as usize].clone(),
133            ));
134        }
135        if self.auto_ws {
136            terminal_mappings.push((
137                WHITESPACE_TOKEN.to_owned(),
138                WHITESPACE,
139                None,
140                terminal_names[WHITESPACE as usize].clone(),
141            ));
142        }
143        if !self.line_comments.is_empty() {
144            let line_comments_rx = self
145                .line_comments
146                .iter()
147                .map(|s| format!(r###"{s}.*(\r\n|\r|\n)?"###))
148                .collect::<Vec<String>>()
149                .join("|");
150            terminal_mappings.push((
151                line_comments_rx,
152                LINE_COMMENT,
153                None,
154                terminal_names[LINE_COMMENT as usize].clone(),
155            ));
156        }
157        if !self.block_comments.is_empty() {
158            let block_comments_rx = self
159                .block_comments
160                .iter()
161                .map(|(s, e)| Self::format_block_comment(s, e))
162                .collect::<Result<Vec<String>>>()?
163                .join("|");
164            terminal_mappings.push((
165                block_comments_rx,
166                BLOCK_COMMENT,
167                None,
168                terminal_names[BLOCK_COMMENT as usize].clone(),
169            ));
170        }
171
172        let mut terminal_mappings = cfg.get_ordered_terminals().iter().enumerate().fold(
173            terminal_mappings,
174            |mut acc, (i, (t, k, l, s))| {
175                if s.contains(&self.scanner_state) {
176                    acc.push((
177                        k.expand(t),
178                        i as TerminalIndex + FIRST_USER_TOKEN,
179                        l.as_ref()
180                            .map(|l| (l.is_positive, l.kind.expand(&l.pattern))),
181                        terminal_names[i + FIRST_USER_TOKEN as usize].clone(),
182                    ));
183                }
184                acc
185            },
186        );
187        // Add the error token as last terminal of the mode, unless allow_unmatched is set
188        if !self.allow_unmatched {
189            let error_index = terminal_names.len() - 1;
190            terminal_mappings.push((
191                ERROR_TOKEN.to_owned(),
192                error_index as TerminalIndex,
193                None,
194                terminal_names[error_index].clone(),
195            ));
196        }
197
198        Ok((terminal_mappings, self.transitions.clone()))
199    }
200
201    /// Formats a block comment
202    /// The block comment is formatted as a regular expression.
203    /// We need to specify the repeated expression for the comment content in such a way that
204    /// the end of the comment is not matched.
205    /// For this we need to allow only sequences that do not start with a substring of the end
206    /// of the comment. Since the end comment can be any string, we need to build an alternation
207    /// of all possible substrings of the end comment.
208    /// If the comment end is "*/" the regular expression is:
209    /// `r"/\*([^*]|\*[^/])*\*/"`
210    fn format_block_comment(s: &str, e: &str) -> Result<String> {
211        // Special case for /* ... */ block comments
212        if s == r"/\*" && e == r"\*/" {
213            // Use improved regex to match /***/ and similar cases
214            return Ok(r"/\*/?([^/]|[^*]/)*\*/".to_string());
215        }
216        let len_with_escaped_chars = |s: &str| {
217            let mut prev = None;
218            s.chars()
219                .map(|c| {
220                    if c == '\\' && !matches!(prev, Some('\\')) {
221                        prev = Some(c);
222                        0
223                    } else {
224                        prev = Some(c);
225                        1
226                    }
227                })
228                .sum::<usize>()
229        };
230        Ok(match len_with_escaped_chars(e) {
231            0 => bail!("Block comment end is empty."),
232            1 => {
233                let c0 = if e.chars().nth(0).unwrap() == '\\' {
234                    if Self::must_escape_in_bracketed_expression(e.chars().nth(1).unwrap()) {
235                        e.to_string()
236                    } else {
237                        e.chars().nth(1).unwrap().escape_default().to_string()
238                    }
239                } else {
240                    e.to_string()
241                };
242                format!(r"{s}[^{c0}]*{e}")
243            }
244            2 => {
245                let (c0, c1) = if e.chars().nth(0).unwrap() == '\\' {
246                    (&e[0..2], &e[2..])
247                } else {
248                    (&e[0..1], &e[1..])
249                };
250                // We need to determine if the character is escaped or not, and if it is escaped
251                // whether it is a regex meta character or not.
252                // If it is a regex meta character we don't need to escape it in a bracket expression.
253                let c0c = if c0.len() > 1 {
254                    debug_assert_eq!(c0.chars().nth(0).unwrap(), '\\');
255                    // Determine if the character after the escape is a regex meta character
256                    if Self::must_escape_in_bracketed_expression(c0.chars().nth(1).unwrap()) {
257                        c0.to_string()
258                    } else {
259                        c0.chars().nth(1).unwrap().escape_default().to_string()
260                    }
261                } else {
262                    debug_assert_eq!(c0.len(), 1);
263                    c0.to_string()
264                };
265                let c1c = if c1.len() > 1 {
266                    debug_assert_eq!(c1.chars().nth(0).unwrap(), '\\');
267                    // Determine if the character after the escape is a regex meta character
268                    if Self::must_escape_in_bracketed_expression(c1.chars().nth(1).unwrap()) {
269                        c1.to_string()
270                    } else {
271                        c1.chars().nth(1).unwrap().escape_default().to_string()
272                    }
273                } else {
274                    debug_assert_eq!(c1.len(), 1);
275                    c1.to_string()
276                };
277                format!(r"{s}([^{c0c}]|{c0}[^{c1c}])*{e}")
278            }
279            _ => bail!(
280                r"Block comment end '{}' is too long. Maximum length is 2.
281                Consider using manual comment handling, maybe with different scanner modes.",
282                e
283            ),
284        })
285    }
286
287    fn must_escape_in_bracketed_expression(c: char) -> bool {
288        matches!(c, '-' | ']' | '^' | '\\')
289    }
290}
291
292impl Default for ScannerConfig {
293    fn default() -> Self {
294        Self {
295            scanner_name: "INITIAL".to_string(),
296            scanner_state: 0,
297            line_comments: Vec::new(),
298            block_comments: Vec::new(),
299            auto_newline: true,
300            auto_ws: true,
301            allow_unmatched: false,
302            transitions: Vec::new(),
303        }
304    }
305}
306
307impl Display for ScannerConfig {
308    fn fmt(&self, f: &mut Formatter<'_>) -> std::result::Result<(), Error> {
309        writeln!(f, "scanner_name: {}", self.scanner_name)?;
310        writeln!(f, "scanner_state: {}", self.scanner_state)?;
311        writeln!(f, "line_comments: {:?}", self.line_comments)?;
312        writeln!(f, "block_comments: {:?}", self.block_comments)?;
313        writeln!(f, "auto_newline: {:?}", self.auto_newline)?;
314        writeln!(f, "auto_ws: {:?}", self.auto_ws)?;
315        self.transitions
316            .iter()
317            .try_for_each(|(k, v)| write!(f, "on {k} enter {v};"))
318    }
319}
320
321#[cfg(test)]
322mod tests {
323    use super::*;
324
325    use scnr2::scanner;
326
327    fn format_matches(expected: &[scnr2::Match], input: &str) -> String {
328        format!(
329            "[{}]",
330            expected
331                .iter()
332                .map(|m| format!(
333                    "(\"{}\", {}, {})",
334                    &input[m.span.start..m.span.end],
335                    m.span.start,
336                    m.span.end
337                ))
338                .collect::<Vec<_>>()
339                .join(", ")
340        )
341    }
342
343    /// To help type inference in the macro
344    fn format_expected_matches(expected: &[(&str, usize, usize)]) -> String {
345        format!("{expected:?}")
346    }
347
    // Generates a scanner `$scanner` with the single mode `M` whose only token is `$pattern`,
    // plus a test function `$test_name` that scans `$input` with it.
    // The test compares the matches found with `$expected`, a slice of
    // `(matched text, start, end)` tuples, where start and end are used as indices into
    // `$input`. `$test_num` is a label that prefixes the assertion messages.
    // `$module` is the path from which the generated scanner type is imported.
    macro_rules! scan_test {
        ($test_name:ident, $module:ident, $scanner:ident, $pattern:expr, $input:expr, $expected:expr, $test_num:expr) => {
            scanner! {
                $scanner {
                    mode M {
                        token $pattern => 0;
                    }
                }
            }
            #[test]
            fn $test_name() {
                use $module::$scanner as S;
                let scanner = S::new();
                let matches = scanner.find_matches($input, 0).collect::<Vec<_>>();
                const EXPECTED_MATCHES: &[(&str, usize, usize)] = $expected;
                // The number of matches is checked first, because the matches are accessed by
                // index below.
                assert_eq!(
                    matches.len(),
                    EXPECTED_MATCHES.len(),
                    "{}: Unexpected match count exp: {:?}, act: {:?}",
                    $test_num,
                    format_expected_matches(&EXPECTED_MATCHES),
                    format_matches(&matches, $input)
                );
                for (i, ma) in EXPECTED_MATCHES.iter().enumerate() {
                    assert_eq!(
                        matches[i].span.start, ma.1,
                        concat!($test_num, ": Match start does not match")
                    );
                    assert_eq!(
                        matches[i].span.end, ma.2,
                        concat!($test_num, ": Match end does not match")
                    );
                    // Checks the consistency of the expectation itself: the expected text has
                    // to be the part of the input at the expected position.
                    assert_eq!(
                        &($input)[ma.1..ma.2],
                        ma.0,
                        concat!($test_num, ": Matched substring does not match expected")
                    );
                }
            }
        };
    }
389
390    #[test]
391    fn test_format_block_comment() {
392        let s = r"/\*";
393        let e = r"\*/";
394        let r = ScannerConfig::format_block_comment(s, e);
395        assert_eq!(r.unwrap(), r"/\*/?([^/]|[^*]/)*\*/");
396
397        let s = r"\{\{";
398        let e = r"\}\}";
399        let r = ScannerConfig::format_block_comment(s, e);
400        assert_eq!(r.unwrap(), r"\{\{([^}]|\}[^}])*\}\}");
401
402        let s = "--";
403        let e = "--";
404        let r = ScannerConfig::format_block_comment(s, e);
405        assert_eq!(r.unwrap(), r"--([^-]|-[^-])*--");
406
407        let s = "#";
408        let e = "#";
409        let r = ScannerConfig::format_block_comment(s, e);
410        assert_eq!(r.unwrap(), r"#[^#]*#");
411
412        let s = r"\{";
413        let e = r"\}";
414        let r = ScannerConfig::format_block_comment(s, e);
415        assert_eq!(r.unwrap(), r"\{[^}]*\}");
416    }
417
    // Each invocation below generates one scanner and one test for it.
    // The arguments are: test name, module, scanner name, pattern, input, the expected matches
    // as (matched text, start, end) and a label for the assertion messages.
    // Tests 1 to 8 use the regular expression that `format_block_comment` returns for the
    // delimiters `/\*` and `\*/`; test 9 uses the one it returns for `\{\{` and `\}\}`.
    scan_test!(
        test_block_comment_1,
        scanner1,
        Scanner1,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /* comment */ more code",
        &[("/* comment */", 5, 18)],
        "Test 1: Simple block comment"
    );

    scan_test!(
        test_block_comment_2,
        scanner2,
        Scanner2,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /***/ more code /* comment */ /* com*ment */",
        &[
            ("/***/", 5, 10),
            ("/* comment */", 21, 34),
            ("/* com*ment */", 35, 49)
        ],
        "Test 2: Multiple block comments with stars inside"
    );

    // Tests for issue #828 - Edge cases with block comment parsing
    scan_test!(
        test_block_comment_empty,
        scanner3,
        Scanner3,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /**/ more code",
        &[("/**/", 5, 9)],
        "Test 3: Empty block comment"
    );

    // Note: the comment in this input consists of four consecutive stars between the slashes.
    scan_test!(
        test_block_comment_triple_star,
        scanner4,
        Scanner4,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /****/ more code",
        &[("/****/", 5, 11)],
        "Test 4: Triple star comment"
    );

    scan_test!(
        test_block_comment_start_end_token,
        scanner5,
        Scanner5,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /***/ more code",
        &[("/***/", 5, 10)],
        "Test 5: Block comment with only start of end token"
    );

    scan_test!(
        test_block_comment_regular_content,
        scanner6,
        Scanner6,
        r"/\*/?([^/]|[^*]/)*\*/",
        "/* normal comment */ /* another * comment */",
        &[
            ("/* normal comment */", 0, 20),
            ("/* another * comment */", 21, 44)
        ],
        "Test 6: Regular block comments with content"
    );

    scan_test!(
        test_block_comment_multiple_sequence,
        scanner7,
        Scanner7,
        r"/\*/?([^/]|[^*]/)*\*/",
        "/**/ /* a */ /****/ /* b*c */ /**/",
        &[
            ("/**/", 0, 4),
            ("/* a */", 5, 12),
            ("/****/", 13, 19),
            ("/* b*c */", 20, 29),
            ("/**/", 30, 34)
        ],
        "Test 7: Multiple block comments in sequence"
    );

    // The first comment of this input starts with `/*/`, which must not end the comment.
    scan_test!(
        test_block_comment_complex_edge_cases,
        scanner8,
        Scanner8,
        r"/\*/?([^/]|[^*]/)*\*/",
        "/*/ not end */ /* ** */ /***/",
        &[
            ("/*/ not end */", 0, 14),
            ("/* ** */", 15, 23),
            ("/***/", 24, 29)
        ],
        "Test 8: Complex edge cases with various star patterns"
    );

    scan_test!(
        test_block_comment_complex_edge_cases_different_delimiters,
        scanner9,
        Scanner9,
        r"\{\{([^}]|\}[^}])*\}\}",
        "{{} not end }} {{ {} }} {{{{}}",
        &[
            ("{{} not end }}", 0, 14),
            ("{{ {} }}", 15, 23),
            ("{{{{}}", 24, 30)
        ],
        "Test 9: Complex edge cases with different block comment delimiters"
    );
529}