use crate::{Lexeme, Offset, Position, Span, TokenId, TOKEN_ERROR};
use regex::{escape, Regex, RegexSet};
pub use regex::Error as RegexError;
/// Regex source matching zero or more whitespace characters, suitable as the
/// `whitespace_regex` argument of `LexerBuilder::new`.
///
/// The character class lists U+0009..=U+000D, U+0020, U+0085, U+200E, U+200F,
/// U+2028 and U+2029, which is the set Unicode calls `Pattern_White_Space`.
/// Because of the trailing `*`, the regex also matches the empty string.
pub const UNICODE_WHITESPACE_REGEX: &str =
"[\\u0009\\u000A\\u000B\\u000C\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029]*";
/// One token pattern: a compiled regex plus, for constant-string tokens, the
/// byte length of the constant.
#[derive(Debug, Clone)]
struct Pattern {
/// Compiled regex for this token. Patterns created through
/// `LexerBuilder::string` and `LexerBuilder::regex` are anchored by
/// `new_regex`; reserved tokens use the never-matching regex `$.`.
regex: Regex,
/// `Some(byte length)` for patterns created by `LexerBuilder::string`,
/// `None` for regex patterns and reserved tokens.
length: Option<usize>,
}
/// Two patterns are considered equal when both the regex source text and the
/// recorded constant length agree. Compiled `Regex` values are not compared
/// directly; only their source strings are.
impl PartialEq for Pattern {
    fn eq(&self, other: &Pattern) -> bool {
        let mine = (self.regex.as_str(), self.length);
        let theirs = (other.regex.as_str(), other.length);
        mine == theirs
    }
}

impl Eq for Pattern {}
/// Builder that collects a whitespace regex and a list of token patterns, and
/// is turned into a `Lexer` by `finish`.
#[derive(Debug, Clone)]
pub struct LexerBuilder {
/// Anchored regex describing the whitespace to skip between lexemes.
whitespace: Regex,
/// Token patterns registered so far; a token's id is its index in this list.
patterns: Vec<Pattern>,
}
impl LexerBuilder {
pub fn new(whitespace_regex: &str) -> Result<LexerBuilder, RegexError> {
let mut builder = LexerBuilder {
whitespace: new_regex(whitespace_regex)?,
patterns: vec![],
};
builder.reserve_token()?; builder.reserve_token()?; builder.reserve_token()?; Ok(builder)
}
pub fn string(&mut self, constant: &str) -> Result<TokenId, RegexError> {
let pattern = Pattern {
regex: new_regex(&escape(constant))?,
length: Some(constant.len()),
};
for (existing_token, existing_pattern) in self.patterns.iter().enumerate() {
if &pattern == existing_pattern {
return Ok(existing_token);
}
}
let token = self.patterns.len();
self.patterns.push(pattern);
Ok(token)
}
pub fn regex(&mut self, regex: &str) -> Result<TokenId, RegexError> {
let pattern = Pattern {
regex: new_regex(regex)?,
length: None,
};
for (existing_token, existing_pattern) in self.patterns.iter().enumerate() {
if &pattern == existing_pattern {
return Ok(existing_token);
}
}
let token = self.patterns.len();
self.patterns.push(pattern);
Ok(token)
}
pub fn reserve_token(&mut self) -> Result<TokenId, RegexError> {
let pattern = Pattern {
regex: Regex::new("$.")?,
length: None,
};
let token = self.patterns.len();
self.patterns.push(pattern);
Ok(token)
}
pub fn finish(self) -> Result<Lexer, RegexError> {
Ok(Lexer {
whitespace: self.whitespace,
regex_set: RegexSet::new(self.patterns.iter().map(|p| p.regex.as_str()))?,
patterns: self.patterns,
})
}
}
/// Compiles `regex` wrapped as `^(...)`, so the result can only match at the
/// very start of the text it is applied to.
///
/// The parentheses keep a top-level alternation in `regex` from escaping the
/// anchor.
fn new_regex(regex: &str) -> Result<Regex, RegexError> {
    let anchored = format!("^({})", regex);
    Regex::new(&anchored)
}
/// A finished lexer, produced by `LexerBuilder::finish`.
#[derive(Debug, Clone)]
pub struct Lexer {
/// Anchored regex describing the whitespace to skip between lexemes.
whitespace: Regex,
/// Token patterns; a token's id is its index in this list.
patterns: Vec<Pattern>,
/// Set built from the regex sources of `patterns`, in the same order, used
/// to find every pattern that matches at the current position in one pass.
regex_set: RegexSet,
}
impl Lexer {
/// Returns an iterator over the lexemes of `source`. Lexing is lazy: each
/// lexeme is computed when the iterator is advanced.
pub fn lex<'l, 's: 'l>(&'l self, source: &'s str) -> impl Iterator<Item = Lexeme> + 'l {
LexemeIter::new(self, source)
}
/// Returns the number of token patterns known to this lexer. The count
/// includes tokens created with `LexerBuilder::reserve_token`.
pub fn num_tokens(&self) -> usize {
self.patterns.len()
}
}
/// Iterator state for lexing one source string with a given `Lexer`.
#[derive(Debug, Clone)]
struct LexemeIter<'l, 's> {
/// The lexer whose whitespace regex and token patterns are applied.
lexer: &'l Lexer,
/// The part of the input that has not been consumed yet.
source: &'s str,
/// Position of the start of `source`; begins at line 0, column 0 and is
/// advanced one character at a time via `Position::advance_by_char`.
position: Position,
/// Number of bytes consumed from the original input so far.
offset: Offset,
}
impl<'l, 's> LexemeIter<'l, 's> {
    /// Starts lexing `source` from the beginning, at line 0, column 0.
    fn new(lexer: &'l Lexer, source: &'s str) -> LexemeIter<'l, 's> {
        let origin = Position {
            line: 0,
            col: 0,
            utf8_col: 0,
        };
        LexemeIter {
            lexer,
            source,
            position: origin,
            offset: 0,
        }
    }

    /// Removes the first `len` bytes from the remaining source and returns the
    /// span they covered, updating the byte offset and position on the way.
    ///
    /// # Panics
    ///
    /// Panics if `len` exceeds the remaining source or does not fall on a
    /// character boundary.
    fn consume(&mut self, len: usize) -> Span {
        let (eaten, rest) = self.source.split_at(len);
        let start = self.position;
        let end = eaten
            .chars()
            .fold(start, |position, ch| position.advance_by_char(ch));
        // The UTF-8 lengths of the consumed characters sum to the slice length.
        self.offset += eaten.len();
        self.position = end;
        self.source = rest;
        Span { start, end }
    }
}
/// Produces the lexemes of the remaining source, one per call to `next`.
impl Iterator for LexemeIter<'_, '_> {
    type Item = Lexeme;

    /// Skips leading whitespace, then returns the next lexeme, or `None` once
    /// the source is exhausted.
    ///
    /// Among all token patterns matching at the current position the longest
    /// match wins; on equal length a constant-string pattern beats a regex
    /// pattern, and otherwise the pattern with the smaller token id wins.
    /// Patterns that match only the empty string here are ignored.
    ///
    /// If no pattern matches, a `TOKEN_ERROR` lexeme is returned that extends
    /// to the next space, tab, carriage return or newline (or to the end of
    /// the source). Every returned lexeme covers at least one character, so
    /// the iterator always terminates.
    fn next(&mut self) -> Option<Lexeme> {
        // The whitespace regex is anchored, so a match always starts at 0.
        if let Some(blank) = self.lexer.whitespace.find(self.source) {
            self.consume(blank.end());
        }
        if self.source.is_empty() {
            return None;
        }

        // Best candidate so far: (token id, byte length, is constant string).
        let mut best_match: Option<(TokenId, usize, bool)> = None;
        for token in &self.lexer.regex_set.matches(self.source) {
            let pattern = &self.lexer.patterns[token];
            let (len, is_str) = match pattern.length {
                Some(len) => (len, true),
                None => {
                    let found = pattern
                        .regex
                        .find(self.source)
                        .expect("a pattern reported by the regex set must match on its own");
                    (found.end(), false)
                }
            };
            // An empty match would not advance the source, and returning it
            // would make the iterator yield the same empty lexeme forever.
            if len == 0 {
                continue;
            }
            let is_best_match = match best_match {
                Some((_, best_len, best_is_str)) => (len, is_str) > (best_len, best_is_str),
                None => true,
            };
            if is_best_match {
                best_match = Some((token, len, is_str));
            }
        }
        if let Some((token, len, _)) = best_match {
            let span = self.consume(len);
            return Some(Lexeme { token, span });
        }

        // Nothing matched: emit an error lexeme up to the next basic whitespace.
        let basic_whitespace = &[' ', '\t', '\r', '\n'];
        let len = match self.source.find(basic_whitespace) {
            // The source starts with a whitespace character that the configured
            // whitespace regex did not skip. Consume that one character so the
            // iterator still makes progress.
            Some(0) => self.source.chars().next().map_or(0, char::len_utf8),
            Some(len) => len,
            None => self.source.len(),
        };
        let span = self.consume(len);
        Some(Lexeme {
            token: TOKEN_ERROR,
            span,
        })
    }
}