use crate::{GrammarConfig, parser::parol_grammar::ScannerStateSwitch};
use anyhow::{Result, bail};
use parol_runtime::{
TerminalIndex,
lexer::{
BLOCK_COMMENT, ERROR_TOKEN, FIRST_USER_TOKEN, LINE_COMMENT, NEW_LINE, NEW_LINE_TOKEN,
WHITESPACE, WHITESPACE_TOKEN,
},
};
use std::fmt::{Debug, Display, Error, Formatter};
/// One scanner terminal as `(pattern, terminal index, optional lookahead, terminal name)`.
/// The lookahead, when present, is `(is_positive, lookahead pattern)`.
type TerminalMapping = (String, TerminalIndex, Option<(bool, String)>, String);
/// A terminal index paired with a scanner state switch
/// (rendered by `Display for ScannerConfig` as `on <index> enter <switch>;`).
type ScannerTransition = (TerminalIndex, ScannerStateSwitch);
/// Result of `ScannerConfig::generate_build_information`:
/// the terminal mappings followed by the scanner transitions.
type BuildInformation = (Vec<TerminalMapping>, Vec<ScannerTransition>);
/// Configuration of a single scanner state.
///
/// The derived `Ord`/`PartialOrd` compare fields in declaration order, so the order of the
/// fields below is part of the type's behavior.
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct ScannerConfig {
/// Name of the scanner state (`Default` uses `"INITIAL"`).
pub scanner_name: String,
/// Numeric index of the scanner state. Only terminals whose state list contains this index
/// are included by `generate_build_information`.
pub scanner_state: usize,
/// Patterns that start a line comment. Each one is used verbatim as the prefix of a regular
/// expression that extends to the end of the line.
pub line_comments: Vec<String>,
/// `(start, end)` pattern pairs that delimit block comments.
pub block_comments: Vec<(String, String)>,
/// When `true`, a mapping for the built-in newline token is generated.
pub auto_newline: bool,
/// When `true`, a mapping for the built-in whitespace token is generated.
pub auto_ws: bool,
/// When `false`, a trailing error-token mapping is generated that uses the last entry of the
/// terminal names. Presumably this is what reports otherwise unmatched input — confirm
/// against the scanner runtime.
pub allow_unmatched: bool,
/// Terminal indices to skip.
/// NOTE(review): only stored and displayed in this file; the exact semantics are defined by
/// the consumers of this configuration.
pub skip_tokens: Vec<TerminalIndex>,
/// Scanner state switches; returned unchanged as the second part of the build information.
pub transitions: Vec<(TerminalIndex, ScannerStateSwitch)>,
}
impl ScannerConfig {
/// Creates a configuration for the scanner state `scanner_state` named `scanner_name`.
///
/// All remaining settings start out with their defaults: automatic newline and whitespace
/// handling enabled, unmatched input not allowed, and no comments, skip tokens or
/// transitions.
pub fn new(scanner_name: String, scanner_state: usize) -> Self {
    // Flags first, so the non-trivial defaults are visible at a glance.
    let (auto_newline, auto_ws, allow_unmatched) = (true, true, false);
    Self {
        scanner_name,
        scanner_state,
        auto_newline,
        auto_ws,
        allow_unmatched,
        line_comments: vec![],
        block_comments: vec![],
        skip_tokens: vec![],
        transitions: vec![],
    }
}
pub fn with_line_comments(mut self, line_comments: Vec<String>) -> Self {
self.line_comments = line_comments;
self
}
pub fn with_block_comments(mut self, block_comments: Vec<(String, String)>) -> Self {
self.block_comments = block_comments;
self
}
pub fn with_auto_newline(mut self, auto_newline: bool) -> Self {
self.auto_newline = auto_newline;
self
}
pub fn with_auto_ws(mut self, auto_ws: bool) -> Self {
self.auto_ws = auto_ws;
self
}
pub fn with_allow_unmatched(mut self, allow_unmatched: bool) -> Self {
self.allow_unmatched = allow_unmatched;
self
}
pub fn with_skip_tokens(mut self, skip_tokens: Vec<TerminalIndex>) -> Self {
self.skip_tokens = skip_tokens;
self
}
pub fn generate_build_information(
&self,
grammar_config: &GrammarConfig,
terminal_names: &[String],
) -> Result<BuildInformation> {
let cfg = &grammar_config.cfg;
let mut terminal_mappings = Vec::new();
if self.auto_newline {
terminal_mappings.push((
NEW_LINE_TOKEN.to_owned(),
NEW_LINE,
None,
terminal_names[NEW_LINE as usize].clone(),
));
}
if self.auto_ws {
terminal_mappings.push((
WHITESPACE_TOKEN.to_owned(),
WHITESPACE,
None,
terminal_names[WHITESPACE as usize].clone(),
));
}
if !self.line_comments.is_empty() {
let line_comments_rx = self
.line_comments
.iter()
.map(|s| format!(r###"{s}.*(\r\n|\r|\n)?"###))
.collect::<Vec<String>>()
.join("|");
terminal_mappings.push((
line_comments_rx,
LINE_COMMENT,
None,
terminal_names[LINE_COMMENT as usize].clone(),
));
}
if !self.block_comments.is_empty() {
let block_comments_rx = self
.block_comments
.iter()
.map(|(s, e)| Self::format_block_comment(s, e))
.collect::<Result<Vec<String>>>()?
.join("|");
terminal_mappings.push((
block_comments_rx,
BLOCK_COMMENT,
None,
terminal_names[BLOCK_COMMENT as usize].clone(),
));
}
let mut terminal_mappings = cfg.get_ordered_terminals().iter().enumerate().fold(
terminal_mappings,
|mut acc, (i, (t, k, l, s))| {
if s.contains(&self.scanner_state) {
acc.push((
k.expand(t),
i as TerminalIndex + FIRST_USER_TOKEN,
l.as_ref()
.map(|l| (l.is_positive, l.kind.expand(&l.pattern))),
terminal_names[i + FIRST_USER_TOKEN as usize].clone(),
));
}
acc
},
);
if !self.allow_unmatched {
let error_index = terminal_names.len() - 1;
terminal_mappings.push((
ERROR_TOKEN.to_owned(),
error_index as TerminalIndex,
None,
terminal_names[error_index].clone(),
));
}
Ok((terminal_mappings, self.transitions.clone()))
}
/// Builds a regular expression that matches one complete block comment.
///
/// `s` and `e` are the start and end delimiters, given as regular expressions
/// (e.g. `/\*` and `\*/`). The end delimiter is split into *atoms*: a plain character, or a
/// backslash together with the character that follows it. The comment body is then
/// described as "anything that does not complete the end delimiter":
///
/// * `/\*` ... `\*/` uses a dedicated, hand-written expression.
/// * An end of two different atoms `a b` yields `s[^a]*(a+[^ab][^a]*)*a+b`.
/// * Every other end of up to three atoms `a b c` yields `s([^a]|a[^b]|ab[^c])*e`,
///   shortened to the atoms that are present.
///
/// Inside the generated bracketed expressions an escaped literal loses its backslash unless
/// the character is special within brackets. Escape sequences that have a meaning of their
/// own (`\n`, `\r`, `\t`, `\f`, `\v`, `\a`, `\d`, `\D`, `\s`, `\S`, `\w`, `\W`) are kept
/// verbatim; previously `\n` was turned into `[^n]`.
///
/// NOTE(review): for three-atom ends the generic form can fail to recognize a terminator
/// that directly follows an extra occurrence of its first atom (e.g. `***)` for the end
/// `**)`).
///
/// # Errors
/// Returns an error if `e` is empty, ends with a dangling backslash, or consists of more
/// than three atoms.
fn format_block_comment(s: &str, e: &str) -> Result<String> {
    if s == r"/\*" && e == r"\*/" {
        return Ok(r"/\*/?([^/]|[^*]/)*\*/".to_string());
    }

    // Splits a pattern into atoms: single characters or `\x` escape pairs.
    let split_escaped_atoms = |pattern: &str| -> Result<Vec<String>> {
        let mut atoms = Vec::new();
        let mut chars = pattern.chars();
        while let Some(c) = chars.next() {
            if c != '\\' {
                atoms.push(c.to_string());
                continue;
            }
            match chars.next() {
                Some(escaped) => atoms.push(format!(r"\{escaped}")),
                None => bail!("Block comment end contains dangling escape: '{}'.", pattern),
            }
        }
        Ok(atoms)
    };

    // Renders an atom so that it can be placed inside `[^...]`.
    let class_safe_atom = |atom: &str| -> String {
        let mut chars = atom.chars();
        let (is_escaped, ch) = match (chars.next(), chars.next()) {
            (Some('\\'), Some(escaped)) => (true, escaped),
            (Some(plain), _) => (false, plain),
            (None, _) => unreachable!("atoms produced by split_escaped_atoms are never empty"),
        };
        let is_escape_sequence = is_escaped
            && matches!(
                ch,
                'n' | 'r' | 't' | 'f' | 'v' | 'a' | 'd' | 'D' | 's' | 'S' | 'w' | 'W'
            );
        if is_escape_sequence {
            // The backslash carries meaning here (`\n` is a line feed, not the letter `n`),
            // so the atom must stay as it is.
            atom.to_string()
        } else if Self::must_escape_in_bracketed_expression(ch) {
            format!(r"\{ch}")
        } else {
            ch.escape_default().to_string()
        }
    };

    let atoms = split_escaped_atoms(e)?;
    if atoms.is_empty() {
        bail!("Block comment end is empty.");
    }
    if atoms.len() > 3 {
        bail!(
            r"Block comment end '{}' is too long. Maximum length is 3.
Consider using manual comment handling, maybe with different scanner modes.",
            e
        );
    }

    // Two different atoms get the dedicated form that copes with repeated first atoms
    // right before the terminator (e.g. `**)` for the end `*)`).
    if atoms.len() == 2 && atoms[0] != atoms[1] {
        let (a0, a1) = (&atoms[0], &atoms[1]);
        let c0 = class_safe_atom(a0);
        let c1 = class_safe_atom(a1);
        let excluded = format!("{c0}{c1}");
        return Ok(format!(
            r"{s}[^{c0}]*({a0}+[^{excluded}][^{c0}]*)*{a0}+{a1}"
        ));
    }

    // Generic form: the i-th alternative accepts the first i atoms of the end delimiter
    // followed by anything but the next atom.
    let alternatives: Vec<String> = atoms
        .iter()
        .enumerate()
        .map(|(i, atom)| format!("{}[^{}]", atoms[..i].concat(), class_safe_atom(atom)))
        .collect();
    Ok(format!(r"{s}({})*{e}", alternatives.join("|")))
}
/// Tells whether `c` has a special meaning inside a bracketed expression (`[...]`) and
/// therefore needs a backslash there.
fn must_escape_in_bracketed_expression(c: char) -> bool {
    const RESERVED_IN_BRACKETS: [char; 4] = ['-', ']', '^', '\\'];
    RESERVED_IN_BRACKETS.contains(&c)
}
}
/// The default configuration describes scanner state 0, named `INITIAL`, with the same
/// settings that `ScannerConfig::new` applies.
impl Default for ScannerConfig {
    fn default() -> Self {
        Self::new(String::from("INITIAL"), 0)
    }
}
/// Human readable dump of the configuration: one `name: value` line per setting, followed
/// by all transitions on a single line. `allow_unmatched` is not part of the output.
impl Display for ScannerConfig {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::result::Result<(), Error> {
        let Self {
            scanner_name,
            scanner_state,
            line_comments,
            block_comments,
            auto_newline,
            auto_ws,
            skip_tokens,
            transitions,
            ..
        } = self;

        writeln!(f, "scanner_name: {}", scanner_name)?;
        writeln!(f, "scanner_state: {}", scanner_state)?;
        writeln!(f, "line_comments: {:?}", line_comments)?;
        writeln!(f, "block_comments: {:?}", block_comments)?;
        writeln!(f, "auto_newline: {:?}", auto_newline)?;
        writeln!(f, "auto_ws: {:?}", auto_ws)?;
        writeln!(f, "skip_tokens: {:?}", skip_tokens)?;

        // Transitions are written back to back without line breaks.
        for (k, v) in transitions {
            write!(f, "on {k} enter {v};")?;
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
use super::*;
use scnr2::scanner;
/// Renders scanner matches as `[("text", start, end), ...]` for assertion messages, where
/// `text` is the slice of `input` covered by the match.
fn format_matches(expected: &[scnr2::Match], input: &str) -> String {
    let rendered: Vec<String> = expected
        .iter()
        .map(|m| {
            let matched_text = &input[m.span.start..m.span.end];
            format!(
                "(\"{}\", {}, {})",
                matched_text, m.span.start, m.span.end
            )
        })
        .collect();
    format!("[{}]", rendered.join(", "))
}
/// Renders the expected `(text, start, end)` triples in their `Debug` form for assertion
/// messages.
fn format_expected_matches(expected: &[(&str, usize, usize)]) -> String {
    format!("{:?}", expected)
}
// Generates a scanner with a single token pattern together with a test that checks the
// matches the scanner finds in an input string.
//
// Parameters:
// * `$test_name` - name of the generated test function
// * `$module`    - module in which `$scanner` is looked up (presumably the module the
//                  `scanner!` macro generates for the scanner — confirm against scnr2)
// * `$scanner`   - name of the scanner type to generate
// * `$pattern`   - regular expression of the only token (token type 0, mode `M`)
// * `$input`     - text to scan
// * `$expected`  - expected matches as `(text, start, end)` triples
// * `$test_num`  - label that prefixes the assertion messages
macro_rules! scan_test {
($test_name:ident, $module:ident, $scanner:ident, $pattern:expr, $input:expr, $expected:expr, $test_num:expr) => {
scanner! {
$scanner {
mode M {
token $pattern => 0;
}
}
}
#[test]
fn $test_name() {
use $module::$scanner as S;
let scanner = S::new();
let matches = scanner.find_matches($input, 0).collect::<Vec<_>>();
const EXPECTED_MATCHES: &[(&str, usize, usize)] = $expected;
// The number of matches is checked first, so indexing `matches` below cannot go out
// of bounds.
assert_eq!(
matches.len(),
EXPECTED_MATCHES.len(),
"{}: Unexpected match count exp: {:?}, act: {:?}",
$test_num,
format_expected_matches(&EXPECTED_MATCHES),
format_matches(&matches, $input)
);
for (i, ma) in EXPECTED_MATCHES.iter().enumerate() {
assert_eq!(
matches[i].span.start, ma.1,
concat!($test_num, ": Match start does not match")
);
assert_eq!(
matches[i].span.end, ma.2,
concat!($test_num, ": Match end does not match")
);
// Guards the test data itself: the expected text must be the slice of the input that
// the expected span describes.
assert_eq!(
&($input)[ma.1..ma.2],
ma.0,
concat!($test_num, ": Matched substring does not match expected")
);
}
}
};
}
/// Checks the regular expressions generated for a range of block comment delimiters and
/// that an end delimiter of more than three characters is rejected.
#[test]
fn test_format_block_comment() {
    // (start delimiter, end delimiter, expected regular expression)
    let accepted: [(&str, &str, &str); 7] = [
        (r"/\*", r"\*/", r"/\*/?([^/]|[^*]/)*\*/"),
        (r"\{\{", r"\}\}", r"\{\{([^}]|\}[^}])*\}\}"),
        ("--", "--", r"--([^\-]|-[^\-])*--"),
        ("#", "#", r"#([^#])*#"),
        (r"\{", r"\}", r"\{([^}])*\}"),
        (r"\(\*", r"\*\)", r"\(\*[^*]*(\*+[^*)][^*]*)*\*+\)"),
        (
            r"\(\(\(",
            r"\)\)\)",
            r"\(\(\(([^)]|\)[^)]|\)\)[^)])*\)\)\)",
        ),
    ];
    for (start, end, expected) in accepted {
        let generated = ScannerConfig::format_block_comment(start, end);
        assert_eq!(generated.unwrap(), expected);
    }

    // Four characters exceed the supported length of an end delimiter.
    let rejected = ScannerConfig::format_block_comment("<", "abcd");
    assert!(rejected.is_err());
}
// Tests 1-8 all use the expression generated for the delimiters `/\*` and `\*/`.

// A single comment embedded in other text.
scan_test!(
test_block_comment_1,
scanner1,
Scanner1,
r"/\*/?([^/]|[^*]/)*\*/",
"code /* comment */ more code",
&[("/* comment */", 5, 18)],
"Test 1: Simple block comment"
);
// Several comments in one input, including `/***/` and a comment with a star in its body.
scan_test!(
test_block_comment_2,
scanner2,
Scanner2,
r"/\*/?([^/]|[^*]/)*\*/",
"code /***/ more code /* comment */ /* com*ment */",
&[
("/***/", 5, 10),
("/* comment */", 21, 34),
("/* com*ment */", 35, 49)
],
"Test 2: Multiple block comments with stars inside"
);
// The shortest possible comment: start delimiter directly followed by the end delimiter.
scan_test!(
test_block_comment_empty,
scanner3,
Scanner3,
r"/\*/?([^/]|[^*]/)*\*/",
"code /**/ more code",
&[("/**/", 5, 9)],
"Test 3: Empty block comment"
);
// The comment body consists of stars only (`/****/`).
scan_test!(
test_block_comment_triple_star,
scanner4,
Scanner4,
r"/\*/?([^/]|[^*]/)*\*/",
"code /****/ more code",
&[("/****/", 5, 11)],
"Test 4: Triple star comment"
);
// The comment body is a single star, i.e. the first character of the end delimiter.
scan_test!(
test_block_comment_start_end_token,
scanner5,
Scanner5,
r"/\*/?([^/]|[^*]/)*\*/",
"code /***/ more code",
&[("/***/", 5, 10)],
"Test 5: Block comment with only start of end token"
);
// Two ordinary comments; the second one contains an isolated star.
scan_test!(
test_block_comment_regular_content,
scanner6,
Scanner6,
r"/\*/?([^/]|[^*]/)*\*/",
"/* normal comment */ /* another * comment */",
&[
("/* normal comment */", 0, 20),
("/* another * comment */", 21, 44)
],
"Test 6: Regular block comments with content"
);
// Five comments separated by single blanks; each must be reported as a match of its own.
scan_test!(
test_block_comment_multiple_sequence,
scanner7,
Scanner7,
r"/\*/?([^/]|[^*]/)*\*/",
"/**/ /* a */ /****/ /* b*c */ /**/",
&[
("/**/", 0, 4),
("/* a */", 5, 12),
("/****/", 13, 19),
("/* b*c */", 20, 29),
("/**/", 30, 34)
],
"Test 7: Multiple block comments in sequence"
);
// `/*/` must not be taken as a complete comment: its star belongs to the start delimiter.
scan_test!(
test_block_comment_complex_edge_cases,
scanner8,
Scanner8,
r"/\*/?([^/]|[^*]/)*\*/",
"/*/ not end */ /* ** */ /***/",
&[
("/*/ not end */", 0, 14),
("/* ** */", 15, 23),
("/***/", 24, 29)
],
"Test 8: Complex edge cases with various star patterns"
);
// Same kind of edge cases for the delimiters `{{` and `}}` (end delimiter of two identical
// characters).
scan_test!(
test_block_comment_complex_edge_cases_different_delimiters,
scanner9,
Scanner9,
r"\{\{([^}]|\}[^}])*\}\}",
"{{} not end }} {{ {} }} {{{{}}",
&[
("{{} not end }}", 0, 14),
("{{ {} }}", 15, 23),
("{{{{}}", 24, 30)
],
"Test 9: Complex edge cases with different block comment delimiters"
);
// End delimiter of three characters (`**)`). The trailing `(***)` of the input is not
// expected to be reported as a comment.
scan_test!(
test_block_comment_double_star_parentheses,
scanner10,
Scanner10,
r"\(\*\*([^*]|\*[^*]|\*\*[^)])*\*\*\)",
"code (** a * b ** c **) more (***) text",
&[("(** a * b ** c **)", 5, 23)],
"Test 10: Block comment delimited by (** and **)"
);
// Scanner for test 11. Its single token is the alternation of four block comment
// expressions, in this order: `(**`...`**)`, `(*`...`*)`, `/*`...`*/` and `{`...`}`.
scanner! {
Scanner11 {
mode M {
token r"\(\*\*([^*]|\*[^*]|\*\*[^)])*\*\*\)|\(\*[^*]*(\*+[^*)][^*]*)*\*+\)|/\*/?([^/]|[^*]/)*\*/|\{([^}])*\}" => 0;
}
}
}
/// Scans an input that contains one comment per delimiter style known to `Scanner11` and
/// checks that each of them is reported as a separate match with the right span.
#[test]
fn test_block_comment_mixed_delimiters_sequence() {
    use scanner11::Scanner11 as S;

    const EXPECTED_MATCHES: &[(&str, usize, usize)] = &[
        ("(** A **)", 0, 9),
        ("(* B *)", 10, 17),
        ("/* C */", 18, 25),
        ("{ D }", 26, 31),
    ];
    let input = "(** A **) (* B *) /* C */ { D }";

    let scanner = S::new();
    let matches: Vec<_> = scanner.find_matches(input, 0).collect();

    // Equal lengths make the pairwise comparison below cover every match.
    assert_eq!(
        matches.len(),
        EXPECTED_MATCHES.len(),
        "Test 11: Mixed block comment delimiters in sequence: Unexpected match count exp: {:?}, act: {:?}",
        format_expected_matches(EXPECTED_MATCHES),
        format_matches(&matches, input)
    );

    for (found, &(text, start, end)) in matches.iter().zip(EXPECTED_MATCHES) {
        assert_eq!(
            found.span.start, start,
            "Test 11: Match start does not match"
        );
        assert_eq!(found.span.end, end, "Test 11: Match end does not match");
        assert_eq!(
            &input[start..end],
            text,
            "Test 11: Matched substring does not match expected"
        );
    }
}
}