1use crate::{GrammarConfig, parser::parol_grammar::ScannerStateSwitch};
2use anyhow::{Result, bail};
3use parol_runtime::{
4 TerminalIndex,
5 lexer::{
6 BLOCK_COMMENT, ERROR_TOKEN, FIRST_USER_TOKEN, LINE_COMMENT, NEW_LINE, NEW_LINE_TOKEN,
7 WHITESPACE, WHITESPACE_TOKEN,
8 },
9};
10use std::fmt::{Debug, Display, Error, Formatter};
11
/// One scanner terminal entry as produced by `ScannerConfig::generate_build_information`:
/// `(pattern, terminal index, optional (is_positive, pattern) pair, terminal name)`.
///
/// NOTE(review): the optional pair is built from a per-terminal value exposing
/// `is_positive`, `kind` and `pattern`; presumably a lookahead expression - confirm.
type TerminalMapping = (String, TerminalIndex, Option<(bool, String)>, String);
/// A terminal index paired with the scanner state switch stored for it.
type ScannerTransition = (TerminalIndex, ScannerStateSwitch);
/// Result payload of `ScannerConfig::generate_build_information`:
/// the terminal mappings followed by a copy of the configured transitions.
type BuildInformation = (Vec<TerminalMapping>, Vec<ScannerTransition>);
18
/// Configuration of a single scanner: its identity, comment delimiters,
/// automatic token handling and the transitions attached to it.
///
/// The derived ordering compares the fields in declaration order.
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct ScannerConfig {
    /// Name of the scanner. The `Default` implementation uses `"INITIAL"`.
    pub scanner_name: String,

    /// Numeric state of this scanner. Only terminals whose state list contains
    /// this value are included by `generate_build_information`.
    pub scanner_state: usize,

    /// Line comment start patterns. Each entry is spliced verbatim into a
    /// regular expression, so it has to be valid regex syntax already.
    pub line_comments: Vec<String>,

    /// Block comment delimiters as `(start, end)` pairs. Both parts are spliced
    /// into a regular expression; the end part may contain at most two
    /// (optionally backslash-escaped) characters.
    pub block_comments: Vec<(String, String)>,

    /// When `true`, a mapping for the runtime's new-line token is generated.
    pub auto_newline: bool,

    /// When `true`, a mapping for the runtime's whitespace token is generated.
    pub auto_ws: bool,

    /// When `false`, a mapping for the runtime's error token is appended as the
    /// last terminal mapping; when `true`, no such mapping is generated.
    pub allow_unmatched: bool,

    /// Transitions of this scanner as `(terminal index, state switch)` pairs.
    /// They are returned unchanged (cloned) by `generate_build_information`.
    pub transitions: Vec<(TerminalIndex, ScannerStateSwitch)>,
}
70
71impl ScannerConfig {
72 pub fn new(scanner_name: String, scanner_state: usize) -> Self {
74 Self {
75 scanner_name,
76 scanner_state,
77 line_comments: Vec::new(),
78 block_comments: Vec::new(),
79 auto_newline: true,
80 auto_ws: true,
81 allow_unmatched: false,
82 transitions: Vec::new(),
83 }
84 }
85
86 pub fn with_line_comments(mut self, line_comments: Vec<String>) -> Self {
88 self.line_comments = line_comments;
89 self
90 }
91
92 pub fn with_block_comments(mut self, block_comments: Vec<(String, String)>) -> Self {
94 self.block_comments = block_comments;
95 self
96 }
97
98 pub fn with_auto_newline(mut self, auto_newline: bool) -> Self {
100 self.auto_newline = auto_newline;
101 self
102 }
103
104 pub fn with_auto_ws(mut self, auto_ws: bool) -> Self {
106 self.auto_ws = auto_ws;
107 self
108 }
109
110 pub fn with_allow_unmatched(mut self, allow_unmatched: bool) -> Self {
112 self.allow_unmatched = allow_unmatched;
113 self
114 }
115
116 pub fn generate_build_information(
121 &self,
122 grammar_config: &GrammarConfig,
123 terminal_names: &[String],
124 ) -> Result<BuildInformation> {
125 let cfg = &grammar_config.cfg;
126 let mut terminal_mappings = Vec::new();
127 if self.auto_newline {
128 terminal_mappings.push((
129 NEW_LINE_TOKEN.to_owned(),
130 NEW_LINE,
131 None,
132 terminal_names[NEW_LINE as usize].clone(),
133 ));
134 }
135 if self.auto_ws {
136 terminal_mappings.push((
137 WHITESPACE_TOKEN.to_owned(),
138 WHITESPACE,
139 None,
140 terminal_names[WHITESPACE as usize].clone(),
141 ));
142 }
143 if !self.line_comments.is_empty() {
144 let line_comments_rx = self
145 .line_comments
146 .iter()
147 .map(|s| format!(r###"{s}.*(\r\n|\r|\n)?"###))
148 .collect::<Vec<String>>()
149 .join("|");
150 terminal_mappings.push((
151 line_comments_rx,
152 LINE_COMMENT,
153 None,
154 terminal_names[LINE_COMMENT as usize].clone(),
155 ));
156 }
157 if !self.block_comments.is_empty() {
158 let block_comments_rx = self
159 .block_comments
160 .iter()
161 .map(|(s, e)| Self::format_block_comment(s, e))
162 .collect::<Result<Vec<String>>>()?
163 .join("|");
164 terminal_mappings.push((
165 block_comments_rx,
166 BLOCK_COMMENT,
167 None,
168 terminal_names[BLOCK_COMMENT as usize].clone(),
169 ));
170 }
171
172 let mut terminal_mappings = cfg.get_ordered_terminals().iter().enumerate().fold(
173 terminal_mappings,
174 |mut acc, (i, (t, k, l, s))| {
175 if s.contains(&self.scanner_state) {
176 acc.push((
177 k.expand(t),
178 i as TerminalIndex + FIRST_USER_TOKEN,
179 l.as_ref()
180 .map(|l| (l.is_positive, l.kind.expand(&l.pattern))),
181 terminal_names[i + FIRST_USER_TOKEN as usize].clone(),
182 ));
183 }
184 acc
185 },
186 );
187 if !self.allow_unmatched {
189 let error_index = terminal_names.len() - 1;
190 terminal_mappings.push((
191 ERROR_TOKEN.to_owned(),
192 error_index as TerminalIndex,
193 None,
194 terminal_names[error_index].clone(),
195 ));
196 }
197
198 Ok((terminal_mappings, self.transitions.clone()))
199 }
200
201 fn format_block_comment(s: &str, e: &str) -> Result<String> {
211 if s == r"/\*" && e == r"\*/" {
213 return Ok(r"/\*/?([^/]|[^*]/)*\*/".to_string());
215 }
216 let len_with_escaped_chars = |s: &str| {
217 let mut prev = None;
218 s.chars()
219 .map(|c| {
220 if c == '\\' && !matches!(prev, Some('\\')) {
221 prev = Some(c);
222 0
223 } else {
224 prev = Some(c);
225 1
226 }
227 })
228 .sum::<usize>()
229 };
230 Ok(match len_with_escaped_chars(e) {
231 0 => bail!("Block comment end is empty."),
232 1 => {
233 let c0 = if e.chars().nth(0).unwrap() == '\\' {
234 if Self::must_escape_in_bracketed_expression(e.chars().nth(1).unwrap()) {
235 e.to_string()
236 } else {
237 e.chars().nth(1).unwrap().escape_default().to_string()
238 }
239 } else {
240 e.to_string()
241 };
242 format!(r"{s}[^{c0}]*{e}")
243 }
244 2 => {
245 let (c0, c1) = if e.chars().nth(0).unwrap() == '\\' {
246 (&e[0..2], &e[2..])
247 } else {
248 (&e[0..1], &e[1..])
249 };
250 let c0c = if c0.len() > 1 {
254 debug_assert_eq!(c0.chars().nth(0).unwrap(), '\\');
255 if Self::must_escape_in_bracketed_expression(c0.chars().nth(1).unwrap()) {
257 c0.to_string()
258 } else {
259 c0.chars().nth(1).unwrap().escape_default().to_string()
260 }
261 } else {
262 debug_assert_eq!(c0.len(), 1);
263 c0.to_string()
264 };
265 let c1c = if c1.len() > 1 {
266 debug_assert_eq!(c1.chars().nth(0).unwrap(), '\\');
267 if Self::must_escape_in_bracketed_expression(c1.chars().nth(1).unwrap()) {
269 c1.to_string()
270 } else {
271 c1.chars().nth(1).unwrap().escape_default().to_string()
272 }
273 } else {
274 debug_assert_eq!(c1.len(), 1);
275 c1.to_string()
276 };
277 format!(r"{s}([^{c0c}]|{c0}[^{c1c}])*{e}")
278 }
279 _ => bail!(
280 r"Block comment end '{}' is too long. Maximum length is 2.
281 Consider using manual comment handling, maybe with different scanner modes.",
282 e
283 ),
284 })
285 }
286
287 fn must_escape_in_bracketed_expression(c: char) -> bool {
288 matches!(c, '-' | ']' | '^' | '\\')
289 }
290}
291
292impl Default for ScannerConfig {
293 fn default() -> Self {
294 Self {
295 scanner_name: "INITIAL".to_string(),
296 scanner_state: 0,
297 line_comments: Vec::new(),
298 block_comments: Vec::new(),
299 auto_newline: true,
300 auto_ws: true,
301 allow_unmatched: false,
302 transitions: Vec::new(),
303 }
304 }
305}
306
307impl Display for ScannerConfig {
308 fn fmt(&self, f: &mut Formatter<'_>) -> std::result::Result<(), Error> {
309 writeln!(f, "scanner_name: {}", self.scanner_name)?;
310 writeln!(f, "scanner_state: {}", self.scanner_state)?;
311 writeln!(f, "line_comments: {:?}", self.line_comments)?;
312 writeln!(f, "block_comments: {:?}", self.block_comments)?;
313 writeln!(f, "auto_newline: {:?}", self.auto_newline)?;
314 writeln!(f, "auto_ws: {:?}", self.auto_ws)?;
315 self.transitions
316 .iter()
317 .try_for_each(|(k, v)| write!(f, "on {k} enter {v};"))
318 }
319}
320
// Tests for the block comment regular expressions: one test checks the
// strings produced by `ScannerConfig::format_block_comment`, the others run
// such expressions through scanners generated by `scnr2::scanner!`.
#[cfg(test)]
mod tests {
    use super::*;

    use scnr2::scanner;

    /// Renders the actual matches as `[("text", start, end), ...]`, taking the
    /// text of each match from `input` via the match span.
    fn format_matches(expected: &[scnr2::Match], input: &str) -> String {
        format!(
            "[{}]",
            expected
                .iter()
                .map(|m| format!(
                    "(\"{}\", {}, {})",
                    &input[m.span.start..m.span.end],
                    m.span.start,
                    m.span.end
                ))
                .collect::<Vec<_>>()
                .join(", ")
        )
    }

    /// Renders the expected `(text, start, end)` triples with their `Debug`
    /// representation for use in assertion messages.
    fn format_expected_matches(expected: &[(&str, usize, usize)]) -> String {
        format!("{expected:?}")
    }

    /// Generates a scanner with a single token pattern plus a test that scans
    /// an input and compares the matches with the expected triples.
    ///
    /// Arguments: test function name, module name and scanner name (the test
    /// imports the scanner as `$module::$scanner`; presumably the module is
    /// generated by `scanner!` - confirm against scnr2), the token pattern,
    /// the input text, the expected `(text, start, end)` triples and a label
    /// used as prefix of the assertion messages.
    macro_rules! scan_test {
        ($test_name:ident, $module:ident, $scanner:ident, $pattern:expr, $input:expr, $expected:expr, $test_num:expr) => {
            scanner! {
                $scanner {
                    mode M {
                        token $pattern => 0;
                    }
                }
            }
            #[test]
            fn $test_name() {
                use $module::$scanner as S;
                let scanner = S::new();
                let matches = scanner.find_matches($input, 0).collect::<Vec<_>>();
                const EXPECTED_MATCHES: &[(&str, usize, usize)] = $expected;
                // Check the number of matches first, so that the indexing
                // into `matches` below cannot go out of bounds.
                assert_eq!(
                    matches.len(),
                    EXPECTED_MATCHES.len(),
                    "{}: Unexpected match count exp: {:?}, act: {:?}",
                    $test_num,
                    format_expected_matches(&EXPECTED_MATCHES),
                    format_matches(&matches, $input)
                );
                for (i, ma) in EXPECTED_MATCHES.iter().enumerate() {
                    assert_eq!(
                        matches[i].span.start, ma.1,
                        concat!($test_num, ": Match start does not match")
                    );
                    assert_eq!(
                        matches[i].span.end, ma.2,
                        concat!($test_num, ": Match end does not match")
                    );
                    // Guards against inconsistent expectations: the expected
                    // text has to be the input slice at the expected span.
                    assert_eq!(
                        &($input)[ma.1..ma.2],
                        ma.0,
                        concat!($test_num, ": Matched substring does not match expected")
                    );
                }
            }
        };
    }

    /// Checks the generated expressions for the special-cased `/* */` pair and
    /// for one- and two-character end delimiters, escaped and unescaped.
    #[test]
    fn test_format_block_comment() {
        let s = r"/\*";
        let e = r"\*/";
        let r = ScannerConfig::format_block_comment(s, e);
        assert_eq!(r.unwrap(), r"/\*/?([^/]|[^*]/)*\*/");

        let s = r"\{\{";
        let e = r"\}\}";
        let r = ScannerConfig::format_block_comment(s, e);
        assert_eq!(r.unwrap(), r"\{\{([^}]|\}[^}])*\}\}");

        let s = "--";
        let e = "--";
        let r = ScannerConfig::format_block_comment(s, e);
        assert_eq!(r.unwrap(), r"--([^-]|-[^-])*--");

        let s = "#";
        let e = "#";
        let r = ScannerConfig::format_block_comment(s, e);
        assert_eq!(r.unwrap(), r"#[^#]*#");

        let s = r"\{";
        let e = r"\}";
        let r = ScannerConfig::format_block_comment(s, e);
        assert_eq!(r.unwrap(), r"\{[^}]*\}");
    }

    // The scanner tests 1 to 8 use the expression that
    // `format_block_comment` returns for the `/\*` and `\*/` delimiters.
    scan_test!(
        test_block_comment_1,
        scanner1,
        Scanner1,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /* comment */ more code",
        &[("/* comment */", 5, 18)],
        "Test 1: Simple block comment"
    );

    scan_test!(
        test_block_comment_2,
        scanner2,
        Scanner2,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /***/ more code /* comment */ /* com*ment */",
        &[
            ("/***/", 5, 10),
            ("/* comment */", 21, 34),
            ("/* com*ment */", 35, 49)
        ],
        "Test 2: Multiple block comments with stars inside"
    );

    scan_test!(
        test_block_comment_empty,
        scanner3,
        Scanner3,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /**/ more code",
        &[("/**/", 5, 9)],
        "Test 3: Empty block comment"
    );

    scan_test!(
        test_block_comment_triple_star,
        scanner4,
        Scanner4,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /****/ more code",
        &[("/****/", 5, 11)],
        "Test 4: Triple star comment"
    );

    scan_test!(
        test_block_comment_start_end_token,
        scanner5,
        Scanner5,
        r"/\*/?([^/]|[^*]/)*\*/",
        "code /***/ more code",
        &[("/***/", 5, 10)],
        "Test 5: Block comment with only start of end token"
    );

    scan_test!(
        test_block_comment_regular_content,
        scanner6,
        Scanner6,
        r"/\*/?([^/]|[^*]/)*\*/",
        "/* normal comment */ /* another * comment */",
        &[
            ("/* normal comment */", 0, 20),
            ("/* another * comment */", 21, 44)
        ],
        "Test 6: Regular block comments with content"
    );

    scan_test!(
        test_block_comment_multiple_sequence,
        scanner7,
        Scanner7,
        r"/\*/?([^/]|[^*]/)*\*/",
        "/**/ /* a */ /****/ /* b*c */ /**/",
        &[
            ("/**/", 0, 4),
            ("/* a */", 5, 12),
            ("/****/", 13, 19),
            ("/* b*c */", 20, 29),
            ("/**/", 30, 34)
        ],
        "Test 7: Multiple block comments in sequence"
    );

    scan_test!(
        test_block_comment_complex_edge_cases,
        scanner8,
        Scanner8,
        r"/\*/?([^/]|[^*]/)*\*/",
        "/*/ not end */ /* ** */ /***/",
        &[
            ("/*/ not end */", 0, 14),
            ("/* ** */", 15, 23),
            ("/***/", 24, 29)
        ],
        "Test 8: Complex edge cases with various star patterns"
    );

    // Test 9 uses the generic two-character form, i.e. the expression
    // `format_block_comment` returns for the `\{\{` and `\}\}` delimiters.
    scan_test!(
        test_block_comment_complex_edge_cases_different_delimiters,
        scanner9,
        Scanner9,
        r"\{\{([^}]|\}[^}])*\}\}",
        "{{} not end }} {{ {} }} {{{{}}",
        &[
            ("{{} not end }}", 0, 14),
            ("{{ {} }}", 15, 23),
            ("{{{{}}", 24, 30)
        ],
        "Test 9: Complex edge cases with different block comment delimiters"
    );
}