scnr/scanner.rs
1use std::{fmt::Debug, path::Path};
2
3use log::trace;
4
5use crate::internal::ScannerImpl;
6
7use crate::{FindMatches, Result, ScannerMode};
8
9/// A trait to switch between scanner modes.
10///
11/// This trait is used to switch between different scanner modes from a parser's perspective.
12/// The parser can set the current scanner mode to switch to a different set of Finite State
13/// Machines, FSMs.
14/// Usually, the scanner mode is changed by the scanner itself based on the transitions defined
15/// in the active scanner mode.
16///
17/// It is discouraged to use a mix of scanner induced mode changes and parser induced mode changes.
18/// This can lead to unexpected behavior and is not recommended.
19///
20/// Only several kinds of parsers are able to handle mode changes as part of the grammar.
21/// Note that 'part of the grammar' means that the mode changes are part of the parser's state
22/// machine and not anything implemented in semantic actions.
23///
24/// For example, an LL parser is able to handle scanner mode switches in the grammar because it
25/// always 'knows' the next production to parse. If the production contains a scanner mode switch,
26/// the parser can switch the scanner mode before scanning the next token.
27///
28/// A LR parser is not able to handle mode changes in the grammar because it does not know the next
29/// production to parse. The parser has to decide which production to parse based on the lookahead
30/// tokens. If the lookahead tokens contain a token that needed a scanner mode switch, the parser
31/// is not able to switch the scanner mode before reading the next token.
32///
33/// An example of a parser induced mode changes is the `parol` parser generator. It is able to
34/// handle mode changes in the grammar because it generates LL parsers. The parser is able to switch
35/// the scanner mode before scanning the next token.
36/// `parol` is also able to handle scanner induced mode changes stored as transitions in the scanner
37/// modes. The scanner mode changes are then no part of the grammar but instead part of the scanner.
38///
39/// Furthermore `parol` can also generate LR parsers. In this case, the scanner mode changes are not
40/// part of the grammar but instead part of the scanner. `parol` will prevent the LR grammar from
41/// containing scanner mode changes.
42///
43/// See <https://github.com/jsinger67/parol> for more information about the `parol` parser
44/// generator.
45pub trait ScannerModeSwitcher {
46 /// Sets the current scanner mode.
47 fn set_mode(&mut self, mode: usize);
48 /// Returns the current scanner mode.
49 fn current_mode(&self) -> usize;
50 /// Returns the name of the scanner mode with the given index.
51 fn mode_name(&self, index: usize) -> Option<&str>;
52}
53
54/// A Scanner.
55/// It consists of multiple DFAs that are used to search for matches.
56///
57/// Each DFA corresponds to a scanner mode that can recognize the tokens that belongs to it.
58/// Scanner modes are known from Flex as *Start conditions* and provides more
59/// flexibility by defining several scanners for several parts of your grammar.
60/// See <https://www.cs.princeton.edu/~appel/modern/c/software/flex/flex.html#SEC11>
61/// for more information.
62///
63/// To create a scanner, you should use the `ScannerBuilder` to add scanner mode data.
64/// At least one scanner mode must be added to the scanner. This single mode is usually named
65/// `INITIAL`.
66#[derive(Debug)]
67pub struct Scanner {
68 pub(crate) inner: ScannerImpl,
69}
70
71impl Scanner {
72 /// Returns an iterator over all non-overlapping matches.
73 /// The iterator yields a [`crate::Match`] value until no more matches could be found.
74 pub fn find_iter<'h>(&self, input: &'h str) -> FindMatches<'h> {
75 FindMatches::new(self.inner.clone(), input)
76 }
77
78 /// Logs the compiled FSMs as a Graphviz DOT file with the help of the `log` crate.
79 /// To enable debug output compiled FSMs as dot file set the environment variable `RUST_LOG` to
80 /// `scnr::internal::scanner_impl=debug`.
81 ///
82 /// This is not available when the `regex_automata` feature is enabled.
83 pub fn log_compiled_automata_as_dot(&self) -> Result<()> {
84 self.inner.log_compiled_automata_as_dot()
85 }
86
87 /// Generates the compiled FSMs as a Graphviz DOT files.
88 /// The DOT files are written to the target folder.
89 /// The file names are derived from the scanner mode names and the index of the regarding FSM.
90 ///
91 /// This is not available when the `regex_automata` feature is enabled.
92 pub fn generate_compiled_automata_as_dot(
93 &self,
94 prefix: &str,
95 target_folder: &Path,
96 ) -> Result<()> {
97 self.inner
98 .generate_compiled_automata_as_dot(prefix, target_folder)
99 }
100}
101
102impl ScannerModeSwitcher for Scanner {
103 /// Returns the current scanner mode.
104 fn current_mode(&self) -> usize {
105 self.inner.current_mode()
106 }
107
108 /// Sets the current scanner mode.
109 ///
110 /// A parser can explicitly set the scanner mode to switch to a different set of FSMs.
111 /// Usually, the scanner mode is changed by the scanner itself based on the transitions defined
112 /// in the active scanner mode.
113 fn set_mode(&mut self, mode: usize) {
114 trace!("Set scanner mode to {}", mode);
115 self.inner.set_mode(mode);
116 }
117
118 /// Returns the name of the scanner mode with the given index.
119 /// If the index is out of bounds, None is returned.
120 fn mode_name(&self, index: usize) -> Option<&str> {
121 self.inner.mode_name(index)
122 }
123}
124
125/// A scanner can be created from a vector of scanner modes.
126impl TryFrom<Vec<ScannerMode>> for Scanner {
127 type Error = crate::ScnrError;
128
129 fn try_from(scanner_modes: Vec<ScannerMode>) -> Result<Self> {
130 Ok(Scanner {
131 inner: scanner_modes.try_into()?,
132 })
133 }
134}
135
136#[cfg(test)]
137mod tests {
138 use super::*;
139 use crate::{Pattern, ScannerBuilder};
140 use std::{fs, sync::Once};
141
142 static INIT: Once = Once::new();
143
144 const TARGET_FOLDER: &str = concat!(
145 env!("CARGO_MANIFEST_DIR"),
146 "/target/testout/test_pathological_regular_expressions_dfa"
147 );
148
149 fn init() {
150 INIT.call_once(|| {
151 let _ = env_logger::builder().is_test(true).try_init();
152 // Delete all previously generated dot files.
153 let _ = fs::remove_dir_all(TARGET_FOLDER);
154 // Create the target folder.
155 fs::create_dir_all(TARGET_FOLDER).unwrap();
156 });
157 }
158
159 #[test]
160 fn test_scanner_builder_with_single_mode() {
161 init();
162 let scanner_mode = ScannerMode::new(
163 "INITIAL",
164 vec![
165 Pattern::new(r"\r\n|\r|\n".to_string(), 1),
166 Pattern::new(r"(//.*(\r\n|\r|\n))".to_string(), 3),
167 ],
168 vec![(1, 1), (3, 1)],
169 );
170 let scanner = ScannerBuilder::new()
171 .add_scanner_mode(scanner_mode)
172 .build()
173 .unwrap();
174 assert_eq!(Some("INITIAL"), scanner.inner.mode_name(0));
175 }
176
177 #[test]
178 // Test the correct sharing of the current mode between the scanner and the scanner impl.
179 fn test_scanner_current_mode() {
180 init();
181 let scanner_mode = ScannerMode::new(
182 "INITIAL",
183 vec![
184 Pattern::new(r"\r\n|\r|\n".to_string(), 1),
185 Pattern::new(r"(//.*(\r\n|\r|\n))".to_string(), 3),
186 ],
187 vec![(1, 1), (3, 1)],
188 );
189 let mut scanner = ScannerBuilder::new()
190 .add_scanner_mode(scanner_mode)
191 .build()
192 .unwrap();
193 // At the beginning, the scanner mode is 0.
194 assert_eq!(0, scanner.current_mode());
195 assert_eq!(0, scanner.inner.current_mode());
196
197 scanner.set_mode(1);
198 assert_eq!(1, scanner.current_mode());
199 assert_eq!(1, scanner.inner.current_mode());
200
201 let mut find_iter = scanner.find_iter("Hello\nWorld");
202 // The creation of a find_iter sets its own scanner mode to 0.
203 assert_eq!(0, find_iter.current_mode());
204
205 assert_eq!(1, scanner.current_mode());
206 assert_eq!(1, scanner.inner.current_mode());
207 assert_eq!(1, scanner.inner.clone().current_mode());
208
209 find_iter.set_mode(1);
210 assert_eq!(1, find_iter.current_mode());
211 scanner.set_mode(0);
212 assert_eq!(0, scanner.current_mode());
213 assert_eq!(0, scanner.inner.current_mode());
214 assert_eq!(0, scanner.inner.clone().current_mode());
215 }
216
217 // A test that checks the behavior of the scanner when so called 'pathological regular expressions'
218 // are used. These are regular expressions that are very slow to match.
219 // The test checks if the scanner is able to handle these cases and does not hang.
220 struct TestData {
221 pattern: &'static str,
222 input: &'static str,
223 expected_match: Option<&'static str>,
224 }
225
226 const TEST_DATA: &[TestData] = &[
227 TestData {
228 pattern: r"((a*)*b)",
229 input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
230 expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
231 },
232 TestData {
233 pattern: r"(a+)+b",
234 input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
235 expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
236 },
237 TestData {
238 pattern: r"(a+)+b",
239 input: "aaaaaaaaaaaaaaaaaaaaaaaaaa",
240 expected_match: None,
241 },
242 TestData {
243 pattern: r"(a|aa)+b",
244 input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
245 expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
246 },
247 TestData {
248 pattern: r"(a|a?)+b",
249 input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
250 expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
251 },
252 TestData {
253 pattern: r"((a|aa|aaa|aaaa|aaaaa)*)*b",
254 input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
255 expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
256 },
257 TestData {
258 pattern: r"((a*a*a*a*a*a*)*)*b",
259 input: "aaaaaaaaaaaaaaaaaaaaaaaaaab",
260 expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaab"),
261 },
262 TestData {
263 pattern: r"a{3}{3}*b",
264 input: "aaaaaaaaaaaaaaaaaaaaaaaaaaab",
265 expected_match: Some("aaaaaaaaaaaaaaaaaaaaaaaaaaab"),
266 },
267 // This test is disabled because it takes too long to run.
268 // It works fine when the DFA is not optimized.
269 // This should be analyzed thoroughly.
270 // TestData {
271 // pattern: r"a{5}{5}{5}{5}{5}{5}*b",
272 // input: "aaaaaaaaaaaaaaaaaaaaaaaaaaab",
273 // expected_match: Some("b"),
274 // },
275 ];
276
277 #[test]
278 fn test_pathological_regular_expressions_dfa() {
279 init();
280
281 #[allow(unused_variables)]
282 for (index, test) in TEST_DATA.iter().enumerate() {
283 let scanner_mode = ScannerMode::new(
284 "INITIAL",
285 vec![Pattern::new(test.pattern.to_string(), 1)],
286 vec![],
287 );
288 let scanner = ScannerBuilder::new()
289 .add_scanner_mode(scanner_mode.clone())
290 .build()
291 .unwrap();
292
293 #[cfg(not(feature = "regex_automata"))]
294 scanner
295 .generate_compiled_automata_as_dot(
296 &format!("Test{}", index),
297 Path::new(&TARGET_FOLDER),
298 )
299 .unwrap();
300
301 let mut find_iter = scanner.find_iter(test.input);
302 let match1 = find_iter.next();
303 assert_eq!(
304 test.expected_match,
305 match1.map(|m| test.input.get(m.start()..m.end()).unwrap())
306 );
307 }
308 }
309}