use fancy_regex::{Regex, RegexBuilder};
use std::collections::HashSet;
use std::fmt::{Debug, Display, Formatter};
use std::hash::Hash;
use std::sync::Arc;
/// A named symbol that is either a terminal or a non-terminal.
///
/// Equality and hashing take both the variant and the name into account, so
/// `Terminal("x")` and `NonTerminal("x")` are distinct symbols.
#[derive(Clone, Hash, Eq, PartialEq)]
pub enum Symbol {
    /// A terminal, identified by its name.
    Terminal(String),
    /// A non-terminal, identified by its name.
    NonTerminal(String),
}

impl Symbol {
    /// Borrows the symbol's name, whichever variant it is.
    pub fn as_str(&self) -> &str {
        // Both variants carry exactly one `String`, so a single irrefutable
        // or-pattern covers the whole enum.
        let (Self::Terminal(name) | Self::NonTerminal(name)) = self;
        name.as_str()
    }

    /// Returns an owned copy of the symbol's name.
    pub fn get_value(&self) -> String {
        self.as_str().to_owned()
    }

    /// Returns `true` for `Symbol::Terminal`, `false` for `Symbol::NonTerminal`.
    #[inline(always)]
    pub fn is_terminal(&self) -> bool {
        match self {
            Self::Terminal(_) => true,
            Self::NonTerminal(_) => false,
        }
    }

    /// Returns `true` when the symbol's name begins with `prefix`.
    pub fn starts_with(&self, prefix: &str) -> bool {
        self.as_str().starts_with(prefix)
    }
}

impl Debug for Symbol {
    /// Formats the symbol as `Terminal(NAME)` or `NonTerminal(NAME)`; the name
    /// is printed without surrounding quotes.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let variant = if self.is_terminal() {
            "Terminal"
        } else {
            "NonTerminal"
        };
        write!(f, "{}({})", variant, self.as_str())
    }
}
#[derive(Debug, Clone)]
pub(crate) enum Pattern {
PatternStr(String),
PatternRegex(Regex),
}
impl Pattern {
pub(crate) fn capture(&self, text: &str) -> Option<(usize, usize)> {
match self {
Pattern::PatternStr(name) => {
if text.starts_with(name) {
return Some((name.len(), name.len()));
}
None
}
Pattern::PatternRegex(regex) => {
if let Ok(Some(caps)) = regex.captures(text)
&& caps.len() > 0
&& let Some(mt) = caps.get(0)
{
return Some((mt.end(), mt.as_str().len()));
}
None
}
}
}
}
/// Switches applied when a regex terminal is compiled.
///
/// `TerminalDef::with_regex` forwards each field to the `RegexBuilder` option
/// named in the field's documentation.
#[derive(Debug, Clone)]
pub(crate) struct RegexFlag {
    /// Forwarded to `RegexBuilder::case_insensitive`.
    pub(crate) i: bool,
    /// Forwarded to `RegexBuilder::multi_line`.
    pub(crate) m: bool,
    /// Forwarded to `RegexBuilder::dot_matches_new_line`.
    pub(crate) s: bool,
    /// Forwarded to `RegexBuilder::unicode_mode`.
    pub(crate) u: bool,
    /// Forwarded to `RegexBuilder::verbose_mode`.
    pub(crate) x: bool,
}

impl Default for RegexFlag {
    /// Every flag is off except `u`, which defaults to on. That single `true`
    /// is why this impl is written by hand rather than derived.
    fn default() -> Self {
        const OFF: bool = false;
        Self {
            i: OFF,
            m: OFF,
            s: OFF,
            u: true,
            x: OFF,
        }
    }
}
/// Definition of one terminal: its symbol, the text it was declared with and
/// the pattern used to recognise it.
#[derive(Debug, Clone)]
pub struct TerminalDef {
    /// Symbol produced when this terminal matches (always `Symbol::Terminal`).
    pub(crate) name: Arc<Symbol>,
    /// The literal text or regex source exactly as passed to the constructor.
    #[allow(dead_code)]
    pub(crate) value: String,
    /// Matcher built from `value`.
    pub(crate) pattern: Pattern,
    /// For literals, the byte length of the literal. For regexes,
    /// `usize::MAX` when the source contains `+` or `*`, otherwise the byte
    /// length of the regex source.
    pub(crate) max_width: usize,
    /// Priority stored exactly as given to the constructor.
    pub(crate) priority: usize,
}

impl TerminalDef {
    /// Builds a terminal that matches the literal text `value`.
    pub(crate) fn with_string(name: &str, value: &str, priority: usize) -> Self {
        Self {
            name: Arc::new(Symbol::Terminal(name.to_string())),
            value: value.to_string(),
            pattern: Pattern::PatternStr(value.to_string()),
            max_width: value.len(),
            priority,
        }
    }

    /// Builds a terminal that matches the regular expression `value`,
    /// compiled with the options in `regex_flag` and anchored to the start of
    /// the text it is applied to.
    ///
    /// # Panics
    ///
    /// Panics when `value` is not a valid regular expression.
    pub(crate) fn with_regex(
        name: &str,
        value: &str,
        regex_flag: RegexFlag,
        priority: usize,
    ) -> Self {
        let compile = |source: &str| {
            RegexBuilder::new(source)
                .case_insensitive(regex_flag.i)
                .multi_line(regex_flag.m)
                .dot_matches_new_line(regex_flag.s)
                .unicode_mode(regex_flag.u)
                .verbose_mode(regex_flag.x)
                .build()
        };

        // `^value` is not a reliable anchor: in `^a|b` only the first
        // alternative is anchored, and in multi-line mode `^` also matches
        // after every newline. `\A(?:value)` pins the whole expression to
        // offset 0 regardless of flags. In verbose mode the group is closed on
        // a fresh line so a trailing `# comment` in `value` cannot swallow the
        // closing parenthesis.
        let group_end = if regex_flag.x { "\n)" } else { ")" };
        let grouped = format!(r"\A(?:{value}{group_end}");
        // If grouping itself makes the pattern unparsable (for example an
        // inline `(?x)` followed by a trailing comment), fall back to the
        // ungrouped form so every pattern that compiled before still does.
        let regex = compile(&grouped)
            .or_else(|_| compile(&format!(r"\A{value}")))
            .unwrap_or_else(|err| {
                panic!("invalid regex {value:?} for terminal {name}: {err}")
            });

        // NOTE(review): this width estimate only looks for `+` and `*`;
        // bounded repetitions such as `a{10}` and escapes such as `\d` make
        // the source length differ from the real maximum match width. Confirm
        // how `max_width` is consumed before relying on it as a true bound.
        let max_width = if value.contains(['+', '*']) {
            usize::MAX
        } else {
            value.len()
        };

        Self {
            name: Arc::new(Symbol::Terminal(name.to_string())),
            value: value.to_string(),
            pattern: Pattern::PatternRegex(regex),
            max_width,
            priority,
        }
    }

    /// Returns a new handle to this terminal's symbol.
    pub fn get_name(&self) -> Arc<Symbol> {
        Arc::clone(&self.name)
    }

    /// Tries to match this terminal at the start of `text`; see
    /// `Pattern::capture` for the meaning of the returned pair.
    fn capture(&self, text: &str) -> Option<(usize, usize)> {
        self.pattern.capture(text)
    }
}

impl PartialEq for TerminalDef {
    /// Two definitions are equal when their symbol and declared text are
    /// equal; the compiled pattern, width and priority are not compared.
    fn eq(&self, other: &Self) -> bool {
        self.name == other.name && self.value == other.value
    }
}
/// A slice of the source text tagged with the terminal that matched it.
#[derive(Debug, Clone)]
pub struct Token {
    /// The complete text the token was cut from.
    source: Arc<str>,
    /// Byte offset in `source` where the token begins.
    start: usize,
    /// Byte offset in `source` just past the token.
    end: usize,
    /// Line number recorded for the token by whoever created it.
    line: usize,
    /// Terminal symbol this token is an instance of.
    pub terminal: Arc<Symbol>,
}

impl Token {
    /// Creates a token covering `source[start..end]`.
    ///
    /// The offsets are stored as given and are not validated here; see
    /// [`Token::word`] for how invalid offsets are handled on access.
    pub fn new(
        source: impl Into<Arc<str>>,
        start: usize,
        end: usize,
        line: usize,
        terminal: Arc<Symbol>,
    ) -> Self {
        Self {
            source: source.into(),
            start,
            end,
            line,
            terminal,
        }
    }

    /// Byte offset where the token begins.
    pub fn get_start(&self) -> usize {
        self.start
    }

    /// Byte offset just past the token.
    pub fn get_end(&self) -> usize {
        self.end
    }

    /// Line number recorded for the token.
    pub fn get_line(&self) -> usize {
        self.line
    }

    /// Owned copy of the terminal's name.
    pub fn get_terminal(&self) -> String {
        self.terminal.get_value()
    }

    /// Returns the text covered by the token.
    ///
    /// Falls back to the whole source when the stored offsets do not describe
    /// a valid slice: `start > end`, `end` past the end of the source, or an
    /// offset that lands inside a multi-byte UTF-8 character.
    pub fn word(&self) -> &str {
        // `str::get` performs the ordering, range and char-boundary checks in
        // one go and never panics, unlike direct indexing.
        match self.source.get(self.start..self.end) {
            Some(slice) => slice,
            None => &self.source[..],
        }
    }
}

impl Display for Token {
    /// Writes `Token { word: …, start: …, end: …, line: …, terminal: … }`,
    /// with the terminal rendered through its `Debug` implementation.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "Token {{ word: {}, start: {}, end: {}, line: {}, terminal: {:?} }}",
            self.word(),
            self.start,
            self.end,
            self.line,
            self.terminal
        )
    }
}

impl PartialEq for Token {
    /// Tokens are equal when they cover the same text, carry the same line
    /// number and belong to the same terminal. Offsets and the identity of
    /// the source are deliberately not part of the comparison.
    fn eq(&self, other: &Self) -> bool {
        self.word() == other.word() && self.line == other.line && self.terminal == other.terminal
    }
}

impl Eq for Token {}
/// Iterator state for splitting a text into tokens.
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// The complete input text.
    text: Arc<str>,
    /// Byte offset in `text` from which the next match is attempted.
    start: usize,
    /// Line counter handed to emitted tokens; starts at 0.
    line: usize,
    /// Byte length of `text`, captured at construction.
    len: usize,
    /// Terminal definitions, tried in this order.
    terminals: Vec<Arc<TerminalDef>>,
    /// Symbols whose matches are consumed without being emitted.
    ignore: Arc<HashSet<Arc<Symbol>>>,
}

impl Tokenizer {
    /// Creates a tokenizer positioned at the beginning of `text`.
    ///
    /// The slice of terminal definitions is copied into the tokenizer; the
    /// ignore set is shared through its `Arc`.
    pub(crate) fn new(
        text: Arc<str>,
        terminals: &[Arc<TerminalDef>],
        ignore: Arc<HashSet<Arc<Symbol>>>,
    ) -> Self {
        Self {
            // Measured before `text` is moved into the struct on the next line.
            len: text.len(),
            text,
            start: 0,
            line: 0,
            terminals: terminals.to_vec(),
            ignore,
        }
    }

    /// Borrows the complete input text.
    pub(crate) fn get_text(&self) -> &str {
        &self.text[..]
    }
}
impl Iterator for Tokenizer {
    type Item = Arc<Token>;

    /// Produces the next non-ignored token, or `None` once the whole text has
    /// been consumed.
    ///
    /// At each position the terminals are tried in their stored order and the
    /// first one with a non-empty match wins. Matches of ignored terminals
    /// are consumed silently. A non-ignored `_NL` match bumps the line counter
    /// by one; it is emitted as a token unless it sits at the very start of
    /// the text, in which case it is dropped.
    ///
    /// Zero-width matches are skipped: they cannot advance the cursor, so
    /// honouring them would repeat the same position forever.
    ///
    /// # Panics
    ///
    /// Panics when no terminal produces a non-empty match at the current
    /// position.
    fn next(&mut self) -> Option<Self::Item> {
        // A loop rather than recursion: a long run of ignored tokens must not
        // grow the call stack.
        while self.start < self.len {
            let token_start = self.start;
            let remaining = &self.text[token_start..];
            let matched = self.terminals.iter().find_map(|terminal| {
                match terminal.capture(remaining) {
                    Some((mt_end, _)) if mt_end > 0 => Some((terminal, mt_end)),
                    _ => None,
                }
            });

            let Some((terminal, mt_end)) = matched else {
                let expected_next_token = self
                    .terminals
                    .iter()
                    .map(|x| match &x.pattern {
                        Pattern::PatternRegex(_) => x.name.as_str().to_string(),
                        Pattern::PatternStr(literal) => literal.clone(),
                    })
                    .collect::<Vec<String>>()
                    .join(", ");
                panic!(
                    "Failed during tokenization at location {} of input text, expecting one of the following terminals: ({}).\n{}\n{}^",
                    token_start,
                    expected_next_token,
                    self.text,
                    " ".repeat(token_start)
                );
            };

            // Always advance to exactly where the token ends so consecutive
            // tokens can neither overlap nor leave a gap.
            let token_end = token_start + mt_end;
            self.start = token_end;

            if self.ignore.contains(&terminal.name) {
                continue;
            }

            // The token carries the line number in effect before any bump
            // caused by the token itself.
            let token_line = self.line;
            if terminal.name.as_str() == "_NL" {
                // NOTE(review): the counter moves by one per emitted `_NL`
                // token, however many '\n' it spans, and ignored newlines do
                // not move it at all, whereas `LexerConf::match_terminal`
                // counts every '\n' before the token. Confirm which numbering
                // is intended.
                self.line += 1;
                if token_start == 0 {
                    // A newline token at the very start of the text is dropped.
                    continue;
                }
            }

            return Some(Arc::new(Token::new(
                Arc::clone(&self.text),
                token_start,
                token_end,
                token_line,
                Arc::clone(&terminal.name),
            )));
        }
        None
    }
}
/// The set of terminal definitions a lexer works with.
#[derive(Debug)]
pub(crate) struct LexerConf {
    /// Terminal definitions, in the order they are tried.
    pub terminals: Vec<Arc<TerminalDef>>,
}

impl LexerConf {
    /// Wraps the given terminal definitions.
    pub fn new(terminals: Vec<Arc<TerminalDef>>) -> Self {
        Self { terminals }
    }

    /// Creates a [`Tokenizer`] over a copy of `text` that silently consumes
    /// matches of the symbols in `ignore`.
    pub fn tokenize(&self, text: &str, ignore: Arc<HashSet<Arc<Symbol>>>) -> Tokenizer {
        Tokenizer::new(Arc::<str>::from(text), &self.terminals, ignore)
    }

    /// Advances `start` past every run of ignored terminals and returns the
    /// first byte offset at which no ignored terminal matches (or
    /// `text.len()` when the rest of the text is ignorable).
    ///
    /// Zero-width matches are disregarded: they cannot move the offset, so
    /// counting them as progress would never terminate.
    pub(crate) fn skip_ignored(
        &self,
        text: &str,
        mut start: usize,
        ignore: &HashSet<Arc<Symbol>>,
    ) -> usize {
        while start < text.len() {
            let slice_text = &text[start..];
            let consumed = self
                .terminals
                .iter()
                .filter(|terminal| ignore.contains(&terminal.name))
                .find_map(|terminal| match terminal.capture(slice_text) {
                    Some((mt_end, _)) if mt_end > 0 => Some(mt_end),
                    _ => None,
                });
            match consumed {
                Some(mt_end) => start += mt_end,
                None => break,
            }
        }
        start
    }

    /// Skips ignored terminals from `start`, then tries to match the terminal
    /// named `expected` at the resulting position.
    ///
    /// Returns the matched token, or `None` when the text is exhausted, no
    /// terminal carries that name, or the terminal does not match there. Only
    /// the first definition named `expected` is consulted. The token's line
    /// is the number of `'\n'` characters that precede it in `text`.
    ///
    /// Each successful call copies `text` into the returned token and rescans
    /// the prefix for newlines, so the cost is proportional to `text.len()`.
    pub(crate) fn match_terminal(
        &self,
        text: &str,
        start: usize,
        expected: &Arc<Symbol>,
        ignore: &HashSet<Arc<Symbol>>,
    ) -> Option<Arc<Token>> {
        let start = self.skip_ignored(text, start, ignore);
        if start >= text.len() {
            return None;
        }
        let terminal = self
            .terminals
            .iter()
            .find(|terminal| &terminal.name == expected)?;
        let (mt_end, _) = terminal.capture(&text[start..])?;
        let line = text[..start].chars().filter(|ch| *ch == '\n').count();
        Some(Arc::new(Token::new(
            Arc::<str>::from(text),
            start,
            start + mt_end,
            line,
            Arc::clone(&terminal.name),
        )))
    }
}
/// Builds the symbol named `word`, choosing the variant from its spelling.
///
/// A name containing at least one ASCII lowercase letter becomes a
/// `Symbol::NonTerminal`; every other name (including the empty string)
/// becomes a `Symbol::Terminal`.
pub fn get_symbol(word: &str) -> Arc<Symbol> {
    let name = word.to_owned();
    // ASCII lowercase letters are single bytes, and bytes of multi-byte UTF-8
    // sequences are never ASCII, so scanning bytes is equivalent to scanning
    // chars for this test.
    let symbol = if word.bytes().any(|byte| byte.is_ascii_lowercase()) {
        Symbol::NonTerminal(name)
    } else {
        Symbol::Terminal(name)
    };
    Arc::new(symbol)
}
// Unit tests for the symbol, terminal, token and tokenizer types defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// `as_str`, `get_value`, `is_terminal` and `starts_with` behave the same way
// for both `Symbol` variants.
#[test]
fn symbol_helpers_work() {
let t = Symbol::Terminal("INT".to_string());
let nt = Symbol::NonTerminal("expr".to_string());
assert_eq!(t.as_str(), "INT");
assert_eq!(nt.get_value(), "expr".to_string());
assert!(t.is_terminal());
assert!(!nt.is_terminal());
assert!(nt.starts_with("ex"));
}
// The default flag set has only `u` switched on.
#[test]
fn regex_flag_default_is_expected() {
let f = RegexFlag::default();
assert!(!f.i);
assert!(!f.m);
assert!(!f.s);
assert!(f.u);
assert!(!f.x);
}
// Literal and regex terminals both report the end offset of a match found at
// the start of the input.
#[test]
fn terminal_def_with_string_and_regex_capture() {
let st = TerminalDef::with_string("PLUS", "+", 0);
let rg = TerminalDef::with_regex("INT", r"\d+", RegexFlag::default(), 0);
assert_eq!(st.get_name().as_ref().as_str(), "PLUS");
assert_eq!(st.capture("+1").unwrap().0, 1);
assert_eq!(rg.capture("123abc").unwrap().0, 3);
}
// `Token::new` stores the offsets and line as given, and `word` slices the
// source with them.
#[test]
fn token_new_sets_fields() {
let tk = Token::new(
Arc::<str>::from("xabc"),
1,
4,
2,
Arc::new(Symbol::Terminal("ID".to_string())),
);
assert_eq!(tk.word(), "abc");
assert_eq!(tk.start, 1);
assert_eq!(tk.end, 4);
assert_eq!(tk.line, 2);
}
// The public getters return the stored values, and `get_terminal` returns the
// terminal's name.
#[test]
fn token_accessors_and_terminal_name_work() {
let tk = Token::new(
Arc::<str>::from("alpha beta"),
6,
10,
3,
Arc::new(Symbol::Terminal("WORD".to_string())),
);
assert_eq!(tk.get_start(), 6);
assert_eq!(tk.get_end(), 10);
assert_eq!(tk.get_line(), 3);
assert_eq!(tk.get_terminal(), "WORD".to_string());
assert_eq!(tk.word(), "beta");
}
// `word` returns the whole source when the offsets are reversed or reach past
// the end of the source.
#[test]
fn token_word_falls_back_to_source_for_invalid_bounds() {
let reversed = Token::new(
Arc::<str>::from("hello"),
4,
2,
0,
Arc::new(Symbol::Terminal("TEXT".to_string())),
);
let beyond_end = Token::new(
Arc::<str>::from("hello"),
0,
10,
0,
Arc::new(Symbol::Terminal("TEXT".to_string())),
);
assert_eq!(reversed.word(), "hello");
assert_eq!(beyond_end.word(), "hello");
}
// Pins the exact `Display` rendering of a token, including the `Debug` form of
// its terminal.
#[test]
fn token_display_contains_core_metadata() {
let tk = Token::new(
Arc::<str>::from("sum"),
0,
3,
1,
Arc::new(Symbol::Terminal("IDENT".to_string())),
);
assert_eq!(
tk.to_string(),
"Token { word: sum, start: 0, end: 3, line: 1, terminal: Terminal(IDENT) }"
);
}
// Tokens compare equal when word, line and terminal agree, even if the
// surrounding source text differs.
#[test]
fn token_equality_depends_on_word_line_and_terminal() {
let lhs = Token::new(
Arc::<str>::from("abc def"),
0,
3,
2,
Arc::new(Symbol::Terminal("IDENT".to_string())),
);
let same = Token::new(
Arc::<str>::from("abc xyz"),
0,
3,
2,
Arc::new(Symbol::Terminal("IDENT".to_string())),
);
let different_line = Token::new(
Arc::<str>::from("abc def"),
0,
3,
4,
Arc::new(Symbol::Terminal("IDENT".to_string())),
);
let different_terminal = Token::new(
Arc::<str>::from("abc def"),
0,
3,
2,
Arc::new(Symbol::Terminal("NUMBER".to_string())),
);
assert_eq!(lhs, same);
assert_ne!(lhs, different_line);
assert_ne!(lhs, different_terminal);
}
// With `WS` and `_NL` in the ignore set, only the three integers are yielded,
// and the tokenizer still exposes the full input text afterwards.
#[test]
fn tokenizer_and_lexer_conf_tokenize_with_ignore() {
let terminals = vec![
Arc::new(TerminalDef::with_regex(
"_NL",
r"\n+",
RegexFlag::default(),
0,
)),
Arc::new(TerminalDef::with_regex(
"WS",
r"[ ]+",
RegexFlag::default(),
0,
)),
Arc::new(TerminalDef::with_regex(
"INT",
r"\d+",
RegexFlag::default(),
0,
)),
];
let lexer = LexerConf::new(terminals);
let mut tokenizer = lexer.tokenize(
"12 34\n56",
Arc::new(
[
Arc::new(Symbol::Terminal("WS".to_string())),
Arc::new(Symbol::Terminal("_NL".to_string())),
]
.into_iter()
.collect(),
),
);
let words = tokenizer
.by_ref()
.map(|x| x.word().to_string())
.collect::<Vec<_>>();
assert_eq!(
words,
vec!["12".to_string(), "34".to_string(), "56".to_string()]
);
assert_eq!(tokenizer.get_text(), "12 34\n56");
}
// Input that no terminal matches makes `next` panic; the panic is caught so
// the test can assert on it.
#[test]
fn tokenizer_panics_on_unmatched_input() {
let terminals = vec![Arc::new(TerminalDef::with_string("A", "a", 0))];
let mut tokenizer =
Tokenizer::new(Arc::<str>::from("x"), &terminals, Arc::new(HashSet::new()));
let panicked = std::panic::catch_unwind(move || {
let _ = tokenizer.next();
});
assert!(panicked.is_err());
}
// A name with ASCII lowercase letters becomes a non-terminal; an all-uppercase
// name becomes a terminal.
#[test]
fn get_symbol_classifies_terminal_vs_non_terminal() {
let nt = get_symbol("expr");
let t = get_symbol("INT");
assert_eq!(nt.as_ref().as_str(), "expr");
assert_eq!(t.as_ref().as_str(), "INT");
assert!(!nt.is_terminal());
assert!(t.is_terminal());
}
}