#![cfg_attr(feature = "strict_docs", allow(missing_docs))]
use adze_ir::{Grammar, SymbolId, TokenPattern};
use regex::Regex;
/// A single lexed token together with its location in the original input.
///
/// Offsets and lengths are in bytes (not chars), measured against the
/// `String` the lexer was constructed with.
#[derive(Debug, Clone)]
pub struct TokenWithPosition {
    /// Grammar symbol this token matched.
    pub symbol_id: SymbolId,
    /// The matched text, copied out of the input.
    pub text: String,
    /// Byte offset of the first byte of the match in the input.
    pub byte_offset: usize,
    /// Byte length of the match (equals `text.len()`).
    #[allow(dead_code)]
    pub byte_length: usize,
}
/// A simple longest-prefix lexer driven by the token patterns of a [`Grammar`].
///
/// Patterns are tried in ascending `SymbolId` order at the current byte
/// position; the first pattern that matches wins.
pub struct GLRLexer {
    // Matchers sorted by symbol id so tokenization order is deterministic.
    token_patterns: Vec<(SymbolId, TokenMatcher)>,
    // The full input being tokenized; owned so tokens can borrow-copy from it.
    input: String,
    // Current byte offset into `input`; kept on a UTF-8 char boundary.
    position: usize,
}
/// How a token pattern is matched against the input.
enum TokenMatcher {
    /// Exact string comparison (e.g. punctuation, keywords).
    Literal(String),
    /// Compiled regex, anchored at construction time in `GLRLexer::new`.
    Regex(Regex),
}
impl TokenMatcher {
    /// Returns the byte length of a match starting exactly at byte `pos`
    /// of `input`, or `None` if this matcher does not match there.
    ///
    /// `pos` must lie on a UTF-8 char boundary; otherwise no match is
    /// reported. Zero-length matches (an empty literal, or a regex such as
    /// `\d*` that can match the empty string) are rejected: accepting them
    /// would return a token that consumes no input, and the caller's
    /// tokenization loop would never advance — an infinite loop.
    fn matches_at(&self, input: &str, pos: usize) -> Option<usize> {
        if !input.is_char_boundary(pos) {
            return None;
        }
        let rest = &input[pos..];
        let len = match self {
            TokenMatcher::Literal(s) => {
                if rest.starts_with(s) {
                    s.len()
                } else {
                    return None;
                }
            }
            TokenMatcher::Regex(re) => match re.find(rest) {
                // Only accept a match anchored at the current position.
                Some(m) if m.start() == 0 => m.len(),
                _ => return None,
            },
        };
        // Reject zero-length matches — see doc comment above.
        if len == 0 {
            None
        } else {
            Some(len)
        }
    }
}
impl GLRLexer {
    /// Builds a lexer for `input` from the token patterns of `grammar`.
    ///
    /// Literal patterns are stored as-is; regex patterns are compiled
    /// eagerly and anchored to the start of the remaining input.
    ///
    /// # Errors
    /// Returns `Err` with a descriptive message if any token regex fails
    /// to compile.
    pub fn new(grammar: &Grammar, input: String) -> Result<Self, String> {
        let mut token_patterns = Vec::new();
        for (symbol_id, token) in &grammar.tokens {
            let matcher = match &token.pattern {
                TokenPattern::String(s) => TokenMatcher::Literal(s.clone()),
                TokenPattern::Regex(pattern) => {
                    // Anchor inside a non-capturing group. A bare "^" prefix
                    // would anchor only the first branch of an alternation:
                    // "^a|b" matches "b" anywhere, while "^(?:a|b)" is
                    // anchored as intended.
                    let anchored_pattern = if pattern.starts_with('^') {
                        pattern.clone()
                    } else {
                        format!("^(?:{})", pattern)
                    };
                    match Regex::new(&anchored_pattern) {
                        Ok(re) => TokenMatcher::Regex(re),
                        Err(e) => {
                            let name = grammar
                                .rule_names
                                .get(symbol_id)
                                .map(|s| s.as_str())
                                .unwrap_or("unknown");
                            return Err(format!("Invalid regex for token {}: {}", name, e));
                        }
                    }
                }
            };
            token_patterns.push((*symbol_id, matcher));
        }
        // Deterministic match order: the lowest symbol id wins ties.
        token_patterns.sort_by_key(|(id, _)| id.0);
        Ok(Self {
            token_patterns,
            input,
            position: 0,
        })
    }

    /// Returns the next token, or `None` at end of input.
    ///
    /// Whitespace is skipped before matching. If no pattern matches at the
    /// current position, one whole character is skipped (error recovery)
    /// and matching is retried. The retry is a loop rather than recursion,
    /// so a long run of unmatchable bytes cannot overflow the stack.
    pub fn next_token(&mut self) -> Option<TokenWithPosition> {
        loop {
            self.skip_whitespace();
            if self.position >= self.input.len() {
                return None;
            }
            let start_pos = self.position;
            for (symbol_id, matcher) in &self.token_patterns {
                if let Some(len) = matcher.matches_at(&self.input, self.position) {
                    let end_pos = self.position + len;
                    // Skip matches that end mid-character, and zero-length
                    // matches that would never advance the position.
                    if len == 0 || !self.input.is_char_boundary(end_pos) {
                        continue;
                    }
                    let text = self.input[self.position..end_pos].to_string();
                    self.position = end_pos;
                    return Some(TokenWithPosition {
                        symbol_id: *symbol_id,
                        text,
                        byte_offset: start_pos,
                        byte_length: len,
                    });
                }
            }
            // No pattern matched here: advance past one character, landing
            // on the next UTF-8 boundary, then try again from the top.
            let mut next_pos = self.position + 1;
            while next_pos < self.input.len() && !self.input.is_char_boundary(next_pos) {
                next_pos += 1;
            }
            self.position = next_pos;
        }
    }

    /// Advances `position` past any ASCII whitespace at the current offset.
    ///
    /// `position` is maintained on a char boundary by all callers, so the
    /// slice below cannot panic. This walks only the remaining input (the
    /// previous implementation re-scanned from the start of the input and
    /// collected every char into a Vec on each call — O(n²) overall).
    fn skip_whitespace(&mut self) {
        while let Some(ch) = self.input[self.position..].chars().next() {
            if matches!(ch, ' ' | '\t' | '\n' | '\r') {
                self.position += ch.len_utf8();
            } else {
                break;
            }
        }
    }

    /// Rewinds the lexer to the beginning of the input.
    #[allow(dead_code)]
    pub fn reset(&mut self) {
        self.position = 0;
    }

    /// Consumes the rest of the input and returns every token found.
    pub fn tokenize_all(&mut self) -> Vec<TokenWithPosition> {
        let mut tokens = Vec::new();
        while let Some(token) = self.next_token() {
            tokens.push(token);
        }
        tokens
    }
}
/// Tokenizes `input` with a lexer built from `grammar`, invoking `parse_fn`
/// once per token with its symbol id, matched text, and byte offset.
///
/// # Errors
/// Propagates lexer-construction failures (e.g. an invalid token regex).
#[allow(dead_code)]
pub fn tokenize_and_parse<F>(grammar: &Grammar, input: &str, mut parse_fn: F) -> Result<(), String>
where
    F: FnMut(SymbolId, &str, usize),
{
    let mut lexer = GLRLexer::new(grammar, input.to_string())?;
    for token in lexer.tokenize_all() {
        parse_fn(token.symbol_id, &token.text, token.byte_offset);
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use adze_ir::{Grammar, Token, TokenPattern};

    /// Builds a non-fragile token for test grammars.
    fn test_token(name: &str, pattern: TokenPattern) -> Token {
        Token {
            name: name.to_string(),
            pattern,
            fragile: false,
        }
    }

    #[test]
    fn test_literal_token_matching() {
        let mut grammar = Grammar::new("test".to_string());
        grammar.tokens.insert(
            SymbolId(1),
            test_token("plus", TokenPattern::String("+".to_string())),
        );
        grammar.tokens.insert(
            SymbolId(2),
            test_token("minus", TokenPattern::String("-".to_string())),
        );
        let mut lexer = GLRLexer::new(&grammar, "+ - +".to_string()).unwrap();
        // Tokens come back in input order with the matching symbol id.
        let expected = [(SymbolId(1), "+"), (SymbolId(2), "-"), (SymbolId(1), "+")];
        for (id, text) in expected {
            let token = lexer.next_token().unwrap();
            assert_eq!(token.symbol_id, id);
            assert_eq!(token.text, text);
        }
        assert!(lexer.next_token().is_none());
    }

    #[test]
    fn test_regex_token_matching() {
        let mut grammar = Grammar::new("test".to_string());
        grammar.tokens.insert(
            SymbolId(1),
            test_token("number", TokenPattern::Regex(r"\d+".to_string())),
        );
        grammar.tokens.insert(
            SymbolId(2),
            test_token("identifier", TokenPattern::Regex(r"[a-zA-Z]\w*".to_string())),
        );
        let mut lexer = GLRLexer::new(&grammar, "123 hello 456 world".to_string()).unwrap();
        let tokens = lexer.tokenize_all();
        let got: Vec<(SymbolId, &str)> = tokens
            .iter()
            .map(|t| (t.symbol_id, t.text.as_str()))
            .collect();
        assert_eq!(
            got,
            vec![
                (SymbolId(1), "123"),
                (SymbolId(2), "hello"),
                (SymbolId(1), "456"),
                (SymbolId(2), "world"),
            ]
        );
    }

    #[test]
    fn test_mixed_tokens() {
        let mut grammar = Grammar::new("test".to_string());
        grammar.tokens.insert(
            SymbolId(1),
            test_token("number", TokenPattern::Regex(r"\d+".to_string())),
        );
        grammar.tokens.insert(
            SymbolId(2),
            test_token("plus", TokenPattern::String("+".to_string())),
        );
        let mut lexer = GLRLexer::new(&grammar, "1 + 2 + 3".to_string()).unwrap();
        let texts: Vec<String> = lexer.tokenize_all().into_iter().map(|t| t.text).collect();
        assert_eq!(texts, ["1", "+", "2", "+", "3"]);
    }

    #[test]
    fn test_utf8_boundary_safety() {
        let mut grammar = Grammar::new("test".to_string());
        grammar.tokens.insert(
            SymbolId(1),
            test_token("word", TokenPattern::Regex(r"[a-zA-Z]+".to_string())),
        );
        // Invalid UTF-8 becomes U+FFFD via from_utf8_lossy; none of the
        // resulting characters should produce a token, and the lexer must
        // not panic while skipping them.
        let raw = vec![190u8, 0, 0];
        let input_str = String::from_utf8_lossy(&raw).to_string();
        let mut lexer = GLRLexer::new(&grammar, input_str).unwrap();
        assert!(lexer.tokenize_all().is_empty());
    }

    #[test]
    fn test_multibyte_character_handling() {
        let mut grammar = Grammar::new("test".to_string());
        grammar.tokens.insert(
            SymbolId(1),
            test_token("letter", TokenPattern::Regex(r"[a-zA-Z]".to_string())),
        );
        // The 4-byte crab emoji matches nothing and must be skipped whole,
        // without splitting it on a non-boundary byte.
        let mut lexer = GLRLexer::new(&grammar, "a🦀b".to_string()).unwrap();
        let texts: Vec<String> = lexer.tokenize_all().into_iter().map(|t| t.text).collect();
        assert_eq!(texts, ["a", "b"]);
    }
}