scnr/
scanner.rs

1use std::{fmt::Debug, path::Path};
2
3use log::trace;
4
5use crate::internal::ScannerImpl;
6
7use crate::{FindMatches, Result, ScannerMode};
8
9/// A trait to switch between scanner modes.
10///
11/// This trait is used to switch between different scanner modes from a parser's perspective.
12/// The parser can set the current scanner mode to switch to a different set of Finite State
13/// Machines, FSMs.
14/// Usually, the scanner mode is changed by the scanner itself based on the transitions defined
15/// in the active scanner mode.
16///
17/// It is discouraged to use a mix of scanner induced mode changes and parser induced mode changes.
18/// This can lead to unexpected behavior and is not recommended.
19///
20/// Only several kinds of parsers are able to handle mode changes as part of the grammar.
21/// Note that 'part of the grammar' means that the mode changes are part of the parser's state
22/// machine and not anything implemented in semantic actions.
23///
24/// For example, an LL parser is able to handle scanner mode switches in the grammar because it
25/// always 'knows' the next production to parse. If the production contains a scanner mode switch,
26/// the parser can switch the scanner mode before scanning the next token.
27///
28/// A LR parser is not able to handle mode changes in the grammar because it does not know the next
29/// production to parse. The parser has to decide which production to parse based on the lookahead
30/// tokens. If the lookahead tokens contain a token that needed a scanner mode switch, the parser
31/// is not able to switch the scanner mode before reading the next token.
32///
33/// An example of a parser induced mode changes is the `parol` parser generator. It is able to
34/// handle mode changes in the grammar because it generates LL parsers. The parser is able to switch
35/// the scanner mode before scanning the next token.
36/// `parol` is also able to handle scanner induced mode changes stored as transitions in the scanner
37/// modes. The scanner mode changes are then no part of the grammar but instead part of the scanner.
38///
39/// Furthermore `parol` can also generate LR parsers. In this case, the scanner mode changes are not
40/// part of the grammar but instead part of the scanner. `parol` will prevent the LR grammar from
41/// containing scanner mode changes.
42///
43/// See <https://github.com/jsinger67/parol> for more information about the `parol` parser
44/// generator.
45pub trait ScannerModeSwitcher {
46    /// Sets the current scanner mode.
47    fn set_mode(&mut self, mode: usize);
48    /// Returns the current scanner mode.
49    fn current_mode(&self) -> usize;
50    /// Returns the name of the scanner mode with the given index.
51    fn mode_name(&self, index: usize) -> Option<&str>;
52}
53
54/// A Scanner.
55/// It consists of multiple DFAs that are used to search for matches.
56///
57/// Each DFA corresponds to a scanner mode that can recognize the tokens that belongs to it.
58/// Scanner modes are known from Flex as *Start conditions* and provides more
59/// flexibility by defining several scanners for several parts of your grammar.
60/// See <https://www.cs.princeton.edu/~appel/modern/c/software/flex/flex.html#SEC11>
61/// for more information.
62///
63/// To create a scanner, you should use the `ScannerBuilder` to add scanner mode data.
64/// At least one scanner mode must be added to the scanner. This single mode is usually named
65/// `INITIAL`.
66#[derive(Debug)]
67pub struct Scanner {
68    pub(crate) inner: ScannerImpl,
69}
70
71impl Scanner {
72    /// Returns an iterator over all non-overlapping matches.
73    /// The iterator yields a [`crate::Match`] value until no more matches could be found.
74    pub fn find_iter<'h>(&self, input: &'h str) -> FindMatches<'h> {
75        FindMatches::new(self.inner.clone(), input)
76    }
77
78    /// Logs the compiled FSMs as a Graphviz DOT file with the help of the `log` crate.
79    /// To enable debug output compiled FSMs as dot file set the environment variable `RUST_LOG` to
80    /// `scnr::internal::scanner_impl=debug`.
81    ///
82    /// This is not available when the `regex_automata` feature is enabled.
83    pub fn log_compiled_automata_as_dot(&self) -> Result<()> {
84        self.inner.log_compiled_automata_as_dot()
85    }
86
87    /// Generates the compiled FSMs as a Graphviz DOT files.
88    /// The DOT files are written to the target folder.
89    /// The file names are derived from the scanner mode names and the index of the regarding FSM.
90    ///
91    /// This is not available when the `regex_automata` feature is enabled.
92    pub fn generate_compiled_automata_as_dot(
93        &self,
94        prefix: &str,
95        target_folder: &Path,
96    ) -> Result<()> {
97        self.inner
98            .generate_compiled_automata_as_dot(prefix, target_folder)
99    }
100}
101
102impl ScannerModeSwitcher for Scanner {
103    /// Returns the current scanner mode.
104    fn current_mode(&self) -> usize {
105        self.inner.current_mode()
106    }
107
108    /// Sets the current scanner mode.
109    ///
110    /// A parser can explicitly set the scanner mode to switch to a different set of FSMs.
111    /// Usually, the scanner mode is changed by the scanner itself based on the transitions defined
112    /// in the active scanner mode.
113    fn set_mode(&mut self, mode: usize) {
114        trace!("Set scanner mode to {}", mode);
115        self.inner.set_mode(mode);
116    }
117
118    /// Returns the name of the scanner mode with the given index.
119    /// If the index is out of bounds, None is returned.
120    fn mode_name(&self, index: usize) -> Option<&str> {
121        self.inner.mode_name(index)
122    }
123}
124
125/// A scanner can be created from a vector of scanner modes.
126impl TryFrom<Vec<ScannerMode>> for Scanner {
127    type Error = crate::ScnrError;
128
129    fn try_from(scanner_modes: Vec<ScannerMode>) -> Result<Self> {
130        Ok(Scanner {
131            inner: scanner_modes.try_into()?,
132        })
133    }
134}
135
136#[cfg(test)]
137mod tests {
138    use super::*;
139    use crate::{Pattern, ScannerBuilder};
140    use std::{fs, sync::Once};
141
142    static INIT: Once = Once::new();
143
144    const TARGET_FOLDER: &str = concat!(
145        env!("CARGO_MANIFEST_DIR"),
146        "/target/testout/test_pathological_regular_expressions_dfa"
147    );
148
149    fn init() {
150        INIT.call_once(|| {
151            let _ = env_logger::builder().is_test(true).try_init();
152            // Delete all previously generated dot files.
153            let _ = fs::remove_dir_all(TARGET_FOLDER);
154            // Create the target folder.
155            fs::create_dir_all(TARGET_FOLDER).unwrap();
156        });
157    }
158
159    #[test]
160    fn test_scanner_builder_with_single_mode() {
161        init();
162        let scanner_mode = ScannerMode::new(
163            "INITIAL",
164            vec![
165                Pattern::new(r"\r\n|\r|\n".to_string(), 1),
166                Pattern::new(r"(//.*(\r\n|\r|\n))".to_string(), 3),
167            ],
168            vec![(1, 1), (3, 1)],
169        );
170        let scanner = ScannerBuilder::new()
171            .add_scanner_mode(scanner_mode)
172            .build()
173            .unwrap();
174        assert_eq!(Some("INITIAL"), scanner.inner.mode_name(0));
175    }
176
177    #[test]
178    // Test the correct sharing of the current mode between the scanner and the scanner impl.
179    fn test_scanner_current_mode() {
180        init();
181        let scanner_mode = ScannerMode::new(
182            "INITIAL",
183            vec![
184                Pattern::new(r"\r\n|\r|\n".to_string(), 1),
185                Pattern::new(r"(//.*(\r\n|\r|\n))".to_string(), 3),
186            ],
187            vec![(1, 1), (3, 1)],
188        );
189        let mut scanner = ScannerBuilder::new()
190            .add_scanner_mode(scanner_mode)
191            .build()
192            .unwrap();
193        // At the beginning, the scanner mode is 0.
194        assert_eq!(0, scanner.current_mode());
195        assert_eq!(0, scanner.inner.current_mode());
196
197        scanner.set_mode(1);
198        assert_eq!(1, scanner.current_mode());
199        assert_eq!(1, scanner.inner.current_mode());
200
201        let mut find_iter = scanner.find_iter("Hello\nWorld");
202        // The creation of a find_iter sets its own scanner mode to 0.
203        assert_eq!(0, find_iter.current_mode());
204
205        assert_eq!(1, scanner.current_mode());
206        assert_eq!(1, scanner.inner.current_mode());
207        assert_eq!(1, scanner.inner.clone().current_mode());
208
209        find_iter.set_mode(1);
210        assert_eq!(1, find_iter.current_mode());
211        scanner.set_mode(0);
212        assert_eq!(0, scanner.current_mode());
213        assert_eq!(0, scanner.inner.current_mode());
214        assert_eq!(0, scanner.inner.clone().current_mode());
215    }
216
217    // A test that checks the behavior of the scanner when so called 'pathological regular expressions'
218    // are used. These are regular expressions that are very slow to match.
219    // The test checks if the scanner is able to handle these cases and does not hang.
220    struct TestData {
221        pattern: &'static str,
222        input: &'static str,
223        expected_match: Option<&'static str>,
224    }
225
226    const TEST_DATA: &[TestData] = &[
227        TestData {
228            pattern: r"((a*)*b)",
229            input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
230            expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
231        },
232        TestData {
233            pattern: r"(a+)+b",
234            input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
235            expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
236        },
237        TestData {
238            pattern: r"(a+)+b",
239            input: "aaaaaaaaaaaaaaaaaaaaaaaaaa",
240            expected_match: None,
241        },
242        TestData {
243            pattern: r"(a|aa)+b",
244            input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
245            expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
246        },
247        TestData {
248            pattern: r"(a|a?)+b",
249            input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
250            expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
251        },
252        TestData {
253            pattern: r"((a|aa|aaa|aaaa|aaaaa)*)*b",
254            input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
255            expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
256        },
257        TestData {
258            pattern: r"((a*a*a*a*a*a*)*)*b",
259            input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
260            expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
261        },
262        TestData {
263            pattern: r"a{3}{3}*b",
264            input: "aaaaaaaaaaaaaaaaaaaaaaaaaaab",
265            expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaaab"),
266        },
267        // This test is disabled because it takes too long to run.
268        // It works fine when the DFA is not optimized.
269        // This should be analyzed thoroughly.
270        // TestData {
271        //     pattern: r"a{5}{5}{5}{5}{5}{5}*b",
272        //     input: "aaaaaaaaaaaaaaaaaaaaaaaaaaab",
273        //     expected_match: Some("b"),
274        // },
275    ];
276
277    #[test]
278    fn test_pathological_regular_expressions_dfa() {
279        init();
280
281        #[allow(unused_variables)]
282        for (index, test) in TEST_DATA.iter().enumerate() {
283            let scanner_mode = ScannerMode::new(
284                "INITIAL",
285                vec![Pattern::new(test.pattern.to_string(), 1)],
286                vec![],
287            );
288            let scanner = ScannerBuilder::new()
289                .add_scanner_mode(scanner_mode.clone())
290                .build()
291                .unwrap();
292
293            #[cfg(not(feature = "regex_automata"))]
294            scanner
295                .generate_compiled_automata_as_dot(
296                    &format!("Test{}", index),
297                    Path::new(&TARGET_FOLDER),
298                )
299                .unwrap();
300
301            let mut find_iter = scanner.find_iter(test.input);
302            let match1 = find_iter.next();
303            assert_eq!(
304                test.expected_match,
305                match1.map(|m| test.input.get(m.start()..m.end()).unwrap())
306            );
307        }
308    }
309}